In [12]:
# =========================
# Import necessary dependencies
# =========================
import os, re, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

import time, urllib.parse, requests, unicodedata

from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix


import warnings
warnings.filterwarnings("ignore")

# -----------------------------
# Configuration
# -----------------------------
DATA_PATH = "medical_abstract.csv"   # <-- change if needed
TEXT_COL  = "Medical_Abstract"
LABEL_COL = "Category_Name"

TABLE_DIR = "result/tables"
PLOT_DIR  = "result/plots"
SEP = "=" * 80

os.makedirs(TABLE_DIR, exist_ok=True)
os.makedirs(PLOT_DIR, exist_ok=True)

# Plot aesthetics & dimensions
plt.rcParams["figure.figsize"] = (9, 5)
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 12
plt.rcParams["xtick.labelsize"] = 10
plt.rcParams["ytick.labelsize"] = 10
plt.rcParams["legend.fontsize"] = 10
plt.rcParams["grid.alpha"] = 0.25
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=[
    "#4E79A7", "#F28E2B", "#E15759", "#76B7B2", "#59A14F",
    "#EDC948", "#B07AA1", "#FF9DA7", "#9C755F", "#BAB0AC"
])

# -----------------------------
#Customized Functions
# -----------------------------
def _round_df(df: pd.DataFrame, ndigits: int = 3) -> pd.DataFrame:
    out = df.copy()
    for c in out.columns:
        if pd.api.types.is_numeric_dtype(out[c]):
            out[c] = out[c].astype(float).round(ndigits)
    return out

def _show_and_save(fig, out_path_png):
    plt.tight_layout()
    plt.show()
    fig.savefig(out_path_png, dpi=1200, bbox_inches="tight")
    print(SEP)

# -----------------------------
# Load
# -----------------------------
df = pd.read_csv(DATA_PATH)
df[TEXT_COL] = df[TEXT_COL].astype(str)

# -----------------------------
# Null checks
# -----------------------------
n_rows = len(df)
null_df = df.isna().sum().to_frame("null_count")
null_df["null_frac_%"] = (null_df["null_count"] / n_rows * 100).round(3)
display(_round_df(null_df.reset_index().rename(columns={"index": "column"})))
print(SEP)

# -----------------------------
# Duplicate checks
# -----------------------------
dup_rows_mask = df.duplicated(keep=False)  # exact row duplicates
dup_text_mask = df[TEXT_COL].duplicated(keep=False)  # identical text, regardless of label

dup_summary = pd.DataFrame({
    "metric": ["duplicate_rows", "duplicate_texts"],
    "count": [int(dup_rows_mask.sum()), int(dup_text_mask.sum())],
    "frac_%": [round(dup_rows_mask.mean() * 100, 3), round(dup_text_mask.mean() * 100, 3)]
})
display(_round_df(dup_summary))
print(SEP)

# -----------------------------
# Text duplicates across categories (cross-label)
# -----------------------------
# Group identical texts; counting distinct labels; keeping those having >1 label
g_text = df.groupby(TEXT_COL)
group_size = g_text.size().rename("group_size")
n_labels = g_text[LABEL_COL].nunique().rename("n_labels")

meta = pd.concat([group_size, n_labels], axis=1).reset_index()
cross_label = meta[(meta["group_size"] > 1) & (meta["n_labels"] > 1)].copy()

# Building a compact table with a short snippet and counts per label
def _label_counts_dict(grp):
    return grp[LABEL_COL].value_counts().to_dict()

lbl_counts = g_text.apply(_label_counts_dict).rename("label_counts").reset_index()
cross_label = cross_label.merge(lbl_counts, on=TEXT_COL, how="left")
cross_label["text_snippet"] = cross_label[TEXT_COL].str.slice(0, 160)

# Displaying top 15 largest cross-label groups
cross_label_view = cross_label.sort_values("group_size", ascending=False)[
    ["text_snippet", "group_size", "n_labels", "label_counts"]
].head(15)
display(cross_label_view)
print(SEP)

# -----------------------------
# Label distribution
# -----------------------------
label_counts = (
    df[LABEL_COL].value_counts()
      .rename_axis("Category")
      .to_frame("Count")
      .reset_index()
)
label_counts["Frac_%"] = (label_counts["Count"] / n_rows * 100).round(3)
display(_round_df(label_counts))
print(SEP)

# Save table
_round_df(label_counts).to_csv(os.path.join(TABLE_DIR, "label_distribution.csv"), index=False)

# -----------------------------
# Length stats & overview
# -----------------------------
df["_char_len"] = df[TEXT_COL].str.len()
df["_word_len"] = df[TEXT_COL].str.split().map(len)

length_stats = pd.DataFrame([{
    "count": len(df),
    "char_mean": df["_char_len"].mean(),
    "char_std": df["_char_len"].std(),
    "char_min": df["_char_len"].min(),
    "char_25%": df["_char_len"].quantile(0.25),
    "char_50%": df["_char_len"].quantile(0.50),
    "char_75%": df["_char_len"].quantile(0.75),
    "char_max": df["_char_len"].max(),
    "word_mean": df["_word_len"].mean(),
    "word_std": df["_word_len"].std(),
    "word_min": df["_word_len"].min(),
    "word_25%": df["_word_len"].quantile(0.25),
    "word_50%": df["_word_len"].quantile(0.50),
    "word_75%": df["_word_len"].quantile(0.75),
    "word_max": df["_word_len"].max(),
}])
display(_round_df(length_stats))
print(SEP)

# Overview metrics
short_thresh_words = 30
long_thresh_words  = int(np.ceil(df["_word_len"].quantile(0.99)))
df["_is_short"] = df["_word_len"] < short_thresh_words
df["_is_long"]  = df["_word_len"] >= long_thresh_words

rows_in_crosslabel_groups = int(
    df[TEXT_COL].isin(set(cross_label[TEXT_COL].values)).sum()
)

overview = pd.DataFrame({
    "metric": [
        "n_rows", "n_cols",
        "duplicate_rows", "duplicate_texts",
        "crosslabel_groups", "rows_in_crosslabel_groups",
        "short_texts_<30w", f"very_long_texts_>=P99w({long_thresh_words})"
    ],
    "value": [
        len(df), df.shape[1],
        int(dup_rows_mask.sum()), int(dup_text_mask.sum()),
        int(len(cross_label)), rows_in_crosslabel_groups,
        int(df["_is_short"].sum()), int(df["_is_long"].sum())
    ]
})
display(_round_df(overview))
print(SEP)

# Save tables
_round_df(length_stats).to_csv(os.path.join(TABLE_DIR, "text_length_stats.csv"), index=False)
_round_df(overview).to_csv(os.path.join(TABLE_DIR, "dataset_overview.csv"), index=False)

# -----------------------------
# Plot: Category distribution
# -----------------------------
fig = plt.figure()
cats = label_counts["Category"].tolist()
vals = label_counts["Count"].tolist()
bars = plt.bar(range(len(cats)), vals)
plt.xticks(range(len(cats)), cats, rotation=30, ha='right')
plt.ylabel("Count")
plt.title(f"Category Distribution (N = {len(df)})")
for i, b in enumerate(bars):
    h = b.get_height()
    pct = vals[i] / len(df) * 100
    plt.text(b.get_x() + b.get_width()/2, h, f"{int(h)} ({pct:.3f}%)",
             ha='center', va='bottom', fontsize=9)
plt.grid(True, axis='y')
_show_and_save(fig, os.path.join(PLOT_DIR, "category_distribution.png"))

# -----------------------------
# Plot: Word-count boxplot by category
# -----------------------------
fig2 = plt.figure()
order_by_median = (
    df.groupby(LABEL_COL)["_word_len"]
      .median()
      .sort_values(ascending=False)
      .index.tolist()
)
data_per_cat = [df[df[LABEL_COL] == c]["_word_len"].values for c in order_by_median]
plt.boxplot(data_per_cat, labels=order_by_median, showmeans=True)
plt.xticks(rotation=30, ha='right')
plt.ylabel("Word Count")
plt.title("Word Count by Category (with means)")
plt.grid(True, axis='y')
_show_and_save(fig2, os.path.join(PLOT_DIR, "word_count_by_category_boxplot.png"))

In [14]:
# ----- Displaying and exporting the identified text level duplicates ------
duplicates = df[df.duplicated(subset=[TEXT_COL], keep=False)]
sorted_duplicates = duplicates.sort_values(by=[TEXT_COL, LABEL_COL])
sorted_duplicates.head(20)
sorted_duplicates = sorted_duplicates[['Category', 'Medical_Abstract', 'Category_Name', '_char_len', '_word_len']]
sorted_duplicates.to_csv(os.path.join(TABLE_DIR, "sorted_duplicates.csv"), index=False)
sorted_duplicates.head(20)

In [15]:
# =============================================================================
# Conservative deduplication (i.e not retaining any of the duplicate instance)
# =============================================================================

def normalize_unicode_ws(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# --- Conservative dedup (drop whole duplicate groups) ---
# Build normalized key (keep text intact; this is only for duplicate detection)
df["_norm_key"] = df[TEXT_COL].map(normalize_unicode_ws)

key_counts = df["_norm_key"].value_counts()
unique_mask = df["_norm_key"].map(key_counts) == 1
df_after = df[unique_mask].copy()

# Drop ultra-short AFTER dedup
df_after["_word_len"] = df_after[TEXT_COL].str.split().map(len)
df_after = df_after[df_after["_word_len"] >= 30].copy()

# --- AFTER label distribution & overview ---
n_after = len(df_after)
label_counts_after = (
    df_after[LABEL_COL].value_counts()
      .rename_axis("Category")
      .to_frame("Count")
      .reset_index()
)
label_counts_after["Frac_%"] = (label_counts_after["Count"] / n_after * 100).round(3)
display(_round_df(label_counts_after))

overview_after = pd.DataFrame({
    "metric": ["n_rows_after", "n_cols", "dropped_duplicate_groups", "dropped_ultra_short_<30w"],
    "value": [
        int(n_after),
        int(df_after.shape[1]),
        int((key_counts > 1).sum()),
        int((df["_norm_key"].map(key_counts) == 1).sum() - n_after)  # uniques before short-drop minus kept
    ]
})
display(_round_df(overview_after))

# save tables
_round_df(label_counts_after).to_csv(os.path.join(TABLE_DIR, "label_distribution_after_conservative_dedup.csv"), index=False)
_round_df(overview_after).to_csv(os.path.join(TABLE_DIR, "dataset_overview_after_conservative_dedup.csv"), index=False)

In [16]:
# ====================================================
# Plotting before Vs after to inspect interim dataset
# ====================================================

def plot_before_after_grouped(dist_before: pd.DataFrame, n_before: int,
                              dist_after: pd.DataFrame, n_after: int,
                              title: str, outfile: str,
                              sort_by: str = "after"):
    # union of categories
    cats = sorted(set(dist_before["Category"]).union(set(dist_after["Category"])))
    b_map = dict(zip(dist_before["Category"], dist_before["Count"]))
    a_map = dict(zip(dist_after["Category"],  dist_after["Count"]))
    before = np.array([b_map.get(c, 0) for c in cats], dtype=int)
    after  = np.array([a_map.get(c, 0) for c in cats], dtype=int)

    # sorting
    if sort_by.lower() == "before":
        order = np.argsort(-before)
    elif sort_by.lower() == "total":
        order = np.argsort(-(before + after))
    else:  # "after"
        order = np.argsort(-after)
    cats_sorted   = [cats[i] for i in order]
    before_sorted = before[order]
    after_sorted  = after[order]

    # y positions and bar geometry
    y = np.arange(len(cats_sorted))
    h = 0.38 #bar thickness
    y_before = y - h/2
    y_after  = y + h/2

    # figure
    fig, ax = plt.subplots(figsize=(9.5, 4.5))
    color_before = "#4E79A7"
    color_after  = "#28A197"

    # bars (horizontal)
    b_plot = ax.barh(y_before, before_sorted, height=h, label="Before", color=color_before)
    a_plot = ax.barh(y_after,  after_sorted,  height=h, label="After",  color=color_after)

    # axes & title
    ax.set_yticks(y)
    ax.set_yticklabels(cats_sorted)
    ax.invert_yaxis()
    ax.set_xlabel("Count")
    ax.set_ylabel("")
    ax.grid(True, axis="x", alpha=0.25)

    # combined title with sample sizes
    ax.set_title(f"{title}\nBefore (N={n_before}) | After (N={n_after})", pad=12)

    # annotations (count and % at the *front* of bars)
    max_val = max(before_sorted.max() if len(before_sorted) else 0,
                  after_sorted.max()  if len(after_sorted)  else 0)
    x_pad = max(1, int(max_val * 0.01))

    for rect, cnt in zip(b_plot, before_sorted):
        pct = (cnt / n_before * 100) if n_before else 0.0
        x = rect.get_width()
        ytxt = rect.get_y() + rect.get_height()/2
        ax.text(x + x_pad, ytxt, f"{int(cnt)} ({pct:.3f}%)",
                va="center", ha="left", fontsize=9)

    for rect, cnt in zip(a_plot, after_sorted):
        pct = (cnt / n_after * 100) if n_after else 0.0
        x = rect.get_width()
        ytxt = rect.get_y() + rect.get_height()/2
        ax.text(x + x_pad, ytxt, f"{int(cnt)} ({pct:.3f}%)",
                va="center", ha="left", fontsize=9)

    ax.set_xlim(0, max_val * 1.20 + x_pad)
    ax.legend(loc="center left", bbox_to_anchor=(1.02, 0.5), frameon=True)

    plt.tight_layout()
    _show_and_save(fig, os.path.join(PLOT_DIR, outfile))

plot_before_after_grouped(
    label_counts_before := label_counts.copy(), len(df),
    label_counts_after, n_after,
    title="Category Distribution: Conservative Deduplication + Drop <30 tokens",
    outfile="category_distribution_grouped_before_after.png",
    sort_by="after")

In [17]:
# ==================
# PubMed spot-check
# ==================

# --- Config ---
ENABLE_PUBMED_CHECK = True
PUBMED_SAMPLE_N = 100
SNIPPET_WORDS   = 12
PUBMED_BASE     = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_DB       = "pubmed"
PUBMED_RETMODE  = "json"
PUBMED_RETMAX   = 1
PUBMED_SLEEP    = 0.50 # ~2 req/s
PUBMED_API_KEY  = None # put your NCBI API key here if you have one
NCBI_TOOL       = "reliability-check"
NCBI_EMAIL      = "your_email@example.com"  # recommended by NCBI

# --- Building stopword for this block ---
STOPWORDS = {
    "the","a","an","of","and","or","to","in","on","for","with","by","as","at","from",
    "is","are","was","were","be","being","been","that","this","these","those","it","its",
    "we","they","their","our","you","your","he","she","his","her","which","who","whom"
}

def _net_ping():
    try:
        r = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=pubmed",
                         timeout=10, headers={"User-Agent":"qa-spotcheck/1.0"})
        print("NCBI eutils reachable, status:", r.status_code)
    except Exception as e:
        print("NCBI eutils NOT reachable:", repr(e))

# --- Whitespace & Unicode normalization ---
def _clean_ws(s: str) -> str:
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# --- First-N words for snippet creation ---
def _first_n_words(s, n=SNIPPET_WORDS):
    return " ".join(s.split()[:n])

# --- Construct PubMed AND-term query from a snippet ---
def _and_terms(snippet, k=6):
    toks = re.findall(r"[A-Za-z0-9]+", snippet.lower())
    toks = [t for t in toks if t not in STOPWORDS]
    toks = toks[:k]
    if not toks:
        return None
    return " AND ".join(f"{t}[Title/Abstract]" for t in toks)

 # --- eSearch wrapper (NCBI Entrez) ---
def _esearch(term, retmax=1):
    params = {
        "db": PUBMED_DB,
        "retmode": PUBMED_RETMODE,
        "retmax": str(retmax),
        "term": term,
        "tool": NCBI_TOOL,
        "email": NCBI_EMAIL
    }
    if PUBMED_API_KEY:
        params["api_key"] = PUBMED_API_KEY
    url = PUBMED_BASE + "?" + urllib.parse.urlencode(params, safe='"[] ')
    r = requests.get(url, headers={"User-Agent": "qa-spotcheck/1.0"}, timeout=20)
    if r.status_code != 200:
        return r.status_code, []
    data = r.json()
    return r.status_code, data.get("esearchresult", {}).get("idlist", [])

# --- Matching strategy cascade for a snippet ---
def _try_match(snippet, debug=False):
    s = _clean_ws(snippet).replace('"', "")
    # Tier 1: exact 12 words in Title/Abstract
    t1 = f"\"{s}\"[Title/Abstract]"
    code, ids = _esearch(t1)
    if debug: print("exact12:", code, "ids:", len(ids))
    if ids: return True, ids[0], "exact12", code

    # Tier 2: exact 8 words in Title/Abstract
    w = s.split()
    if len(w) > 8:
        s8 = " ".join(w[:8])
        t2 = f"\"{s8}\"[Title/Abstract]"
        code, ids = _esearch(t2)
        if debug: print("exact8:", code, "ids:", len(ids))
        if ids: return True, ids[0], "exact8", code

    # Tier 3: AND of 6 content terms
    t3 = _and_terms(s, k=6)
    if t3:
        code, ids = _esearch(t3)
        if debug: print("and6:", code, "ids:", len(ids), "|", t3[:120])
        if ids: return True, ids[0], "and6", code

    # Tier 4: exact phrase without field tag (fallback)
    t4 = f"\"{s}\""
    code, ids = _esearch(t4)
    if debug: print("nofield:", code, "ids:", len(ids))
    if ids: return True, ids[0], "nofield", code

    return False, "", "nomatch", code

# --- Run spot-check on the deduplicated data ---
if not ENABLE_PUBMED_CHECK:
    print("PubMed spot-check disabled (ENABLE_PUBMED_CHECK=False).")
else:
    _net_ping()

    if "df_after" not in globals() or df_after.empty:
        print("No df_after found or it's empty. Did you run the dedup cell?")
    else:
        sample_n = min(PUBMED_SAMPLE_N, len(df_after))
        sample_df = df_after.sample(n=sample_n, random_state=42)[[TEXT_COL, LABEL_COL]].reset_index(drop=True)
        sample_df["snippet"] = sample_df[TEXT_COL].apply(lambda s: _first_n_words(s, n=SNIPPET_WORDS))

        # --- Try to match snippets to PubMed ---
        results = []
        for i, row in sample_df.iterrows():
            ok, pmid, how, code = _try_match(row["snippet"], debug=(i < 5))  # debug first 5
            results.append({
                "snippet": row["snippet"],
                "label": row[LABEL_COL],
                "match": bool(ok),
                "pmid": pmid,
                "strategy": how,
                "http_status": code
            })
            time.sleep(PUBMED_SLEEP)

        # --- Summarize & display ---
        res_df = _round_df(pd.DataFrame(results))
        match_rate = float(res_df["match"].mean() * 100.0) if len(res_df) else 0.0
        summary_df = _round_df(pd.DataFrame([{
            "sample_n": int(len(res_df)),
            "matches": int(res_df["match"].sum()),
            "match_rate_%": round(match_rate, 3)
        }]))

        print("\nPubMed spot-check results (with debug on first 5):")
        display(res_df)
        print("\nPubMed spot-check summary:")
        display(summary_df)

        #exports
        res_df.to_csv(os.path.join(TABLE_DIR, "pubmed_spotcheck_results.csv"), index=False)
        summary_df.to_csv(os.path.join(TABLE_DIR, "pubmed_spotcheck_summary.csv"), index=False)

In [18]:
summary = (
    res_df.assign(kind=res_df.apply(lambda r: "rate_limited_429" if (r["http_status"]==429 and not r["match"])
                              else ("true_nomatch" if (r["http_status"]==200 and not r["match"])
                              else ("matched" if r["match"] else "other")), axis=1))
      .groupby("kind").size().rename("n").reset_index()
)
summary["pct_%"] = (summary["n"] / len(res_df) * 100).round(3)
summary

In [19]:
# ====================================================================
# Diagnostic test for abstract labels: checking for diffusing level
# ====================================================================

# ---------- working corpus (deduplicated) ----------
df_dedup = df_after.copy()

def _tokenize_words(s: str):
    return re.findall(r"[A-Za-z]+", str(s).lower())

def _row_norm(xr: csr_matrix) -> csr_matrix:
    denom = np.sqrt((xr.multiply(xr)).sum())
    return xr if denom == 0 else xr / denom


# ======================================================================
# 1) Cross-label duplicate membership by class (RAW df)
# ======================================================================
if "cross_label" not in globals():
    g_text = df.groupby(TEXT_COL)
    group_size = g_text.size().rename("group_size")
    n_labels = g_text[LABEL_COL].nunique().rename("n_labels")
    meta = pd.concat([group_size, n_labels], axis=1).reset_index()
    cross_label_local = meta[(meta["group_size"] > 1) & (meta["n_labels"] > 1)].copy()
else:
    cross_label_local = cross_label.copy()

cross_texts = set(cross_label_local[TEXT_COL].tolist())
mask_cross = df[TEXT_COL].isin(cross_texts)
per_class_cross = (
    df.loc[mask_cross, LABEL_COL].value_counts()
      .rename_axis("Category").to_frame("Count").reset_index()
)
per_class_cross["Frac_in_crosslabel_%"] = (per_class_cross["Count"] / mask_cross.sum() * 100).round(3)

display(_round_df(per_class_cross))
_round_df(per_class_cross).to_csv(os.path.join(TABLE_DIR, "gpc_justification_crosslabel.csv"), index=False)

fig = plt.figure()
cats = per_class_cross["Category"].tolist()
vals = per_class_cross["Count"].tolist()
bars = plt.bar(range(len(cats)), vals)
plt.xticks(range(len(cats)), cats, rotation=30, ha='right')
plt.ylabel("Rows in cross-label duplicate groups")
plt.title("Cross-label duplicate membership by class (RAW)")
for i, b in enumerate(bars):
    h = b.get_height()
    pct = vals[i] / mask_cross.sum() * 100 if mask_cross.sum() else 0.0
    plt.text(b.get_x()+b.get_width()/2, h, f"{int(h)} ({pct:.3f}%)", ha='center', va='bottom', fontsize=9)
plt.grid(True, axis='y')
_show_and_save(fig, os.path.join(PLOT_DIR, "crosslabel_membership_by_class.png"))

# ======================================================================
# 2) Pairwise Jaccard similarity (token overlap) on df_dedup
# ======================================================================
cat_tokens = {}
for cat, grp in df_dedup.groupby(LABEL_COL):
    toks = set()
    for t in grp[TEXT_COL]:
        toks.update(_tokenize_words(t))
    cat_tokens[cat] = toks

cats_sorted = sorted(cat_tokens.keys())
recs = []
for i in range(len(cats_sorted)):
    for j in range(i+1, len(cats_sorted)):
        c1, c2 = cats_sorted[i], cats_sorted[j]
        inter = len(cat_tokens[c1] & cat_tokens[c2])
        union = max(1, len(cat_tokens[c1] | cat_tokens[c2]))
        recs.append({"Category_1": c1, "Category_2": c2, "Jaccard": round(inter/union, 3)})
jacc_df = pd.DataFrame(recs).sort_values(["Category_1","Category_2"]).reset_index(drop=True)
display(_round_df(jacc_df))
_round_df(jacc_df).to_csv(os.path.join(TABLE_DIR, "jaccard_pairs_after_dedup.csv"), index=False)

# heatmap
mat = np.zeros((len(cats_sorted), len(cats_sorted)))
for i, ci in enumerate(cats_sorted):
    for j, cj in enumerate(cats_sorted):
        if i==j:
            mat[i,j] = 1.0
        else:
            a, b = (ci, cj) if ci < cj else (cj, ci)
            v = jacc_df[(jacc_df["Category_1"]==a)&(jacc_df["Category_2"]==b)]["Jaccard"].values
            mat[i,j] = float(v[0]) if len(v) else 0.0

fig = plt.figure(figsize=(7.5,6.5))
plt.imshow(mat, cmap="YlGnBu", vmin=0, vmax=1)
plt.title("Category Jaccard Similarity (deduplicated corpus)")
plt.xticks(range(len(cats_sorted)), cats_sorted, rotation=45, ha='right')
plt.yticks(range(len(cats_sorted)), cats_sorted)
for i in range(len(cats_sorted)):
    for j in range(len(cats_sorted)):
        plt.text(j, i, f"{mat[i,j]:.3f}", ha='center', va='center', fontsize=7)
plt.colorbar(label="Jaccard")
_show_and_save(fig, os.path.join(PLOT_DIR, "jaccard_heatmap_after_dedup.png"))

# ======================================================================
# 3) Centroid margin separability (unsupervised, TF-IDF word 1–2)
# ======================================================================
vec = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=2, max_features=100_000)
X = vec.fit_transform(df_dedup[TEXT_COL].astype(str))
y = df_dedup[LABEL_COL].astype(str).values
classes = sorted(np.unique(y))

# Build L2-normalized class centroids in **dense** ndarray form
centroids = {}
for c in classes:
    rows = (y == c)
    if rows.sum() == 0:
        centroids[c] = None
        continue
    v = np.asarray(X[rows].mean(axis=0)).ravel() # mean -> ndarray (1D)
    nrm = np.linalg.norm(v)
    centroids[c] = v if nrm == 0 else v / nrm

# Compute cosine similarities to each centroid; margin = top1 - top2
margins = []
X_csr = X.tocsr()
for i in range(X_csr.shape[0]):
    xv = X_csr[i].toarray().ravel()
    nrm = np.linalg.norm(xv)
    xv = xv if nrm == 0 else xv / nrm
    sims = []
    for c in classes:
        centroid = centroids[c]
        sim = float(np.dot(xv, centroid)) if centroid is not None else 0.0
        sims.append((c, sim))
    sims.sort(key=lambda t: t[1], reverse=True)
    top1 = sims[0][1]
    top2 = sims[1][1] if len(sims) > 1 else 0.0
    margins.append({"Category": y[i], "margin": round(top1 - top2, 3)})

margin_df = pd.DataFrame(margins)
margin_stats = (
    margin_df.groupby("Category")["margin"]
             .agg(["count","mean","median","std","min","max"])
             .reset_index()
)
display(_round_df(margin_stats))
_round_df(margin_stats).to_csv(os.path.join(TABLE_DIR, "centroid_margin_stats.csv"), index=False)

# Boxplot
fig = plt.figure(figsize=(9.5,5.5))
order = margin_df.groupby("Category")["margin"].median().sort_values(ascending=False).index.tolist()
data  = [margin_df.loc[margin_df["Category"]==c, "margin"].values for c in order]
plt.boxplot(data, labels=order, showmeans=True)
plt.xticks(rotation=30, ha='right')
plt.ylabel("Centroid margin (top1 - top2 cosine sim)")
plt.title("Unsupervised separability by class (higher = clearer boundary)")
plt.grid(True, axis='y')
_show_and_save(fig, os.path.join(PLOT_DIR, "centroid_margin_boxplot_by_class.png"))


# ======================================================================
# Lexical exclusivity via log-odds (Monroe et al.) — z>2 distinctive tokens
# ======================================================================
# --- Token counts per class
def word_counts_per_class(df_in, text_col, label_col):
    class_counts = defaultdict(Counter)
    total_tokens = {}
    docs_tokens = defaultdict(list)  # store token lists per doc for concentration
    for cat, grp in df_in.groupby(label_col):
        cnt = Counter()
        n_tokens = 0
        for txt in grp[text_col].astype(str):
            toks = _tokenize_words(txt)
            docs_tokens[cat].append(toks)
            cnt.update(toks)
            n_tokens += len(toks)
        class_counts[cat] = cnt
        total_tokens[cat] = n_tokens
    return class_counts, total_tokens, docs_tokens

class_counts, total_tokens, docs_tokens = word_counts_per_class(df_dedup, TEXT_COL, LABEL_COL)
classes = sorted(class_counts.keys())
vocab = sorted({w for c in classes for w in class_counts[c]})
bg = Counter()
for c in classes: bg.update(class_counts[c])

# Monroe et al. (2008) log-odds with informative Dirichlet prior
alpha = 0.01  # symmetric prior

def _log_odds_z_for_class(target):
    n1 = sum(class_counts[target].values())
    n2 = sum((bg - class_counts[target]).values())
    out = {}
    for w in vocab:
        c1 = class_counts[target][w]
        c2 = bg[w] - c1
        p1 = (c1 + alpha) / (n1 + alpha * len(vocab))
        p2 = (c2 + alpha) / (n2 + alpha * len(vocab))
        var = (1.0 / (c1 + alpha)) + (1.0 / (c2 + alpha))
        # log-odds difference with tiny guards
        denom1 = max(1e-12, 1 - p1)
        denom2 = max(1e-12, 1 - p2)
        delta = np.log(p1 / denom1) - np.log(p2 / denom2)
        z = delta / np.sqrt(var)
        out[w] = z
    return out

zs_by_class = {c: _log_odds_z_for_class(c) for c in classes}

# --- Aggregate exclusivity: count tokens with z > 2 per class ---
exclusivity = []
for c in classes:
    zmap = _log_odds_z_for_class(c)
    distinctive = sum(1 for z in zmap.values() if z > 2.0)
    exclusivity.append({"Category": c, "distinctive_token_count_z>2": distinctive})
excl_df = pd.DataFrame(exclusivity).sort_values("distinctive_token_count_z>2", ascending=False)
 # --- Display & export exclusivity table ---
display(_round_df(excl_df))
_round_df(excl_df).to_csv(os.path.join(TABLE_DIR, "lexical_exclusivity_summary.csv"), index=False)

fig = plt.figure()
cats = excl_df["Category"].tolist()
vals = excl_df["distinctive_token_count_z>2"].tolist()
bars = plt.bar(range(len(cats)), vals)
plt.xticks(range(len(cats)), cats, rotation=30, ha='right')
plt.ylabel("Distinctive tokens (z-score > 2)")
plt.title("Lexical exclusivity by class")
for i, b in enumerate(bars):
    plt.text(b.get_x()+b.get_width()/2, b.get_height(), f"{int(vals[i])}", ha='center', va='bottom', fontsize=9)
plt.grid(True, axis='y')
_show_and_save(fig, os.path.join(PLOT_DIR, "lexical_exclusivity_bar.png"))

# =============================
# 4) Distinctive types per 10k tokens (z > 2)
# =============================
rows = []
for c in classes:
    n_tokens = max(1, total_tokens[c])
    n_distinctive = sum(1 for _, z in zs_by_class[c].items() if z > 2.0)
    per10k = n_distinctive / n_tokens * 10_000.0
    rows.append({
        "Category": c,
        "total_tokens": int(n_tokens),
        "distinctive_types_z>2": int(n_distinctive),
        "distinctive_types_per10k": round(per10k, 3),
    })

per10k_df = pd.DataFrame(rows).sort_values("distinctive_types_per10k", ascending=False)
display(_round_df(per10k_df))

# save table
_per10k_path = os.path.join(TABLE_DIR, "lexical_exclusivity_per10k.csv")
_round_df(per10k_df).to_csv(_per10k_path, index=False)

# plot bar
fig = plt.figure()
cats = per10k_df["Category"].tolist()
vals = per10k_df["distinctive_types_per10k"].tolist()
bars = plt.bar(range(len(cats)), vals)
plt.xticks(range(len(cats)), cats, rotation=30, ha='right')
plt.ylabel("Distinctive types per 10k tokens (z > 2)")
plt.title("Normalized lexical exclusivity by class")
for i, b in enumerate(bars):
    h = b.get_height()
    plt.text(b.get_x()+b.get_width()/2, h, f"{h:.3f}", ha='center', va='bottom', fontsize=9)
plt.grid(True, axis='y')
_show_and_save(fig, os.path.join(PLOT_DIR, "lexical_exclusivity_per10k_bar.png"))

# ===========================================
# 5. Cohesion & Distinctiveness Concentration
# ===========================================
# own-centroid cosine per doc
X_csr = X.tocsr()
own_sim = []
for i in range(X_csr.shape[0]):
    xv = X_csr[i].toarray().ravel()
    nrm = np.linalg.norm(xv)
    xv = xv if nrm == 0 else xv / nrm
    sim = float(np.dot(xv, centroids[y[i]]))
    own_sim.append({"Category": y[i], "own_centroid_cosine": sim})

cohesion_df = pd.DataFrame(own_sim).groupby("Category")["own_centroid_cosine"]\
    .agg(["count","mean","median","std","min","max"]).reset_index()
display(_round_df(cohesion_df))
cohesion_path = os.path.join(TABLE_DIR, "cohesion_by_class.csv")
_round_df(cohesion_df).to_csv(cohesion_path, index=False)

# plot cohesion (mean cosine)
fig = plt.figure()
order = cohesion_df.sort_values("mean", ascending=False)["Category"].tolist()
vals  = cohesion_df.set_index("Category").loc[order, "mean"].values
bars = plt.bar(range(len(order)), vals)
plt.xticks(range(len(order)), order, rotation=30, ha='right')
plt.ylabel("Mean own-centroid cosine")
plt.title("Intra-class cohesion (higher = more coherent)")
for i, b in enumerate(bars):
    h = b.get_height()
    plt.text(b.get_x()+b.get_width()/2, h, f"{h:.3f}", ha='center', va='bottom', fontsize=9)
plt.grid(True, axis='y')
_show_and_save(fig, os.path.join(PLOT_DIR, "cohesion_mean_bar.png"))

# =============================
# 6) Top-K most distinctive tokens per class via log-odds
# =============================
TOP_K = 500  # can tune (100–1000)
rows = []
for c in classes:
    zmap = zs_by_class[c]
    topk = {w for w,_ in sorted(zmap.items(), key=lambda t: t[1], reverse=True)[:TOP_K]}
    doc_fracs = []
    for toks in docs_tokens[c]:
        if len(toks) == 0:
            doc_fracs.append(0.0)
        else:
            doc_fracs.append(sum(1 for t in toks if t in topk) / len(toks))
    rows.append({
        "Category": c,
        "TopK": TOP_K,
        "mean_fraction_topK": np.mean(doc_fracs),
        "median_fraction_topK": np.median(doc_fracs),
    })

conc_df = pd.DataFrame(rows)
conc_df[["mean_fraction_topK","median_fraction_topK"]] = conc_df[["mean_fraction_topK","median_fraction_topK"]]*100
conc_df = conc_df.sort_values("mean_fraction_topK", ascending=False)
display(_round_df(conc_df))
conc_path = os.path.join(TABLE_DIR, "distinctiveness_concentration.csv")
_round_df(conc_df).to_csv(conc_path, index=False)

# plot concentration (mean % of tokens in Top-K)
fig = plt.figure()
cats = conc_df["Category"].tolist()
vals = conc_df["mean_fraction_topK"].tolist()
bars = plt.bar(range(len(cats)), vals)
plt.xticks(range(len(cats)), cats, rotation=30, ha='right')
plt.ylabel("% tokens in class Top-K distinctive list")
plt.title(f"Distinctiveness concentration (Top-{TOP_K}) — higher = more focused")
for i, b in enumerate(bars):
    h = b.get_height()
    plt.text(b.get_x()+b.get_width()/2, h, f"{h:.3f}%", ha='center', va='bottom', fontsize=9)
plt.grid(True, axis='y')
_show_and_save(fig, os.path.join(PLOT_DIR, "distinctiveness_concentration_bar.png"))

print(SEP)
print("Saved:")
print(" -", os.path.join(PLOT_DIR, "lexical_exclusivity_per10k_bar.png"))
print(" -", cohesion_path)
print(" -", conc_path)
print(" -", os.path.join(PLOT_DIR, "cohesion_mean_bar.png"))
print(" -", os.path.join(PLOT_DIR, "distinctiveness_concentration_bar.png"))
print("Tables/plots saved to result/ and displayed above.")

In [20]:
# ========================================================================
# Drop the 'General Pathological Conditions' and save final class labels
# ========================================================================

EXCLUDED_CLASS = "General Pathological Conditions"

# Filter out GPC and keep only selected columns
df_4class = df_after[df_after[LABEL_COL] != EXCLUDED_CLASS].copy()

# Select the three columns of interest
df_4class = df_4class[['Category', 'Medical_Abstract', 'Category_Name']]

# Save to CSV
out_path = os.path.join(TABLE_DIR, "deduplicated_medical_abstract.csv")
df_4class.to_csv(out_path, index=False, encoding="utf-8")

print(f"Saved 4-class deduplicated corpus: {out_path}")
print(f"Rows: {len(df_4class)} | Columns: {list(df_4class.columns)}")

In [21]:
# ========================================================================
# OPTIONAL: Save results(i.e Plots and tables) to google drive
# ========================================================================

# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

import os
import shutil
from pathlib import Path
from typing import Tuple

# ---- CONFIGURE YOUR PATHS HERE ----

# === SOURCE FOLDERS ===
CSV_SRC_DIR = '/content/result/tables'
PNG_SRC_DIR = '/content/result/plots'

# Destination root in Drive
DRIVE_ROOT = '/content/drive/MyDrive/' # <------ Adjust filepath to your desired destination
CSV_DST_DIR = os.path.join(DRIVE_ROOT, '02_EDA_CSV')
PNG_DST_DIR = os.path.join(DRIVE_ROOT, '01_EDA_Plots')


def format_bytes(n: int) -> str:
    for unit in ['B','KB','MB','GB','TB']:
        if n < 1024:
            return f"{n:.2f} {unit}"
        n /= 1024
    return f"{n:.2f} PB"

def copy_folder_contents(src: str, dst: str) -> Tuple[int, int]:
    """
    Recursively copy the *contents* of src into destination.
    - Creates directories as needed
    - Overwrites files in destination if they already exist
    """
    src = os.path.abspath(src)
    dst = os.path.abspath(dst)

    if not os.path.exists(src):
        print(f"Source not found: {src}")
        return (0, 0)

    os.makedirs(dst, exist_ok=True)
    files_copied = 0
    bytes_copied = 0

    for dirpath, dirnames, filenames in os.walk(src):
        rel = os.path.relpath(dirpath, src)
        target_dir = dst if rel == '.' else os.path.join(dst, rel)
        os.makedirs(target_dir, exist_ok=True)

        for fname in filenames:
            s = os.path.join(dirpath, fname)
            d = os.path.join(target_dir, fname)

            # Ensure overwrite: if target exists as a file, remove it first
            if os.path.isfile(d):
                try:
                    os.remove(d)
                except Exception as e:
                    print(f"Could not remove existing file before overwrite: {d} ({e})")

            # Copy with metadata (copy2); will overwrite when path is the full file target
            shutil.copy2(s, d)
            files_copied += 1
            try:
                bytes_copied += os.path.getsize(s)
            except Exception:
                pass

    return (files_copied, bytes_copied)

# Run copies
print("Ensuring destination folders exist on Drive...")
os.makedirs(CSV_DST_DIR, exist_ok=True)
os.makedirs(PNG_DST_DIR, exist_ok=True)

print(f"\n Copying CSV folder:\n  SRC: {CSV_SRC_DIR}\n  DST: {CSV_DST_DIR}")
csv_count, csv_bytes = copy_folder_contents(CSV_SRC_DIR, CSV_DST_DIR)
print(f"CSV copy complete: {csv_count} files, {format_bytes(csv_bytes)}")

print(f"\n Copying PNG folder:\n  SRC: {PNG_SRC_DIR}\n  DST: {PNG_DST_DIR}")
png_count, png_bytes = copy_folder_contents(PNG_SRC_DIR, PNG_DST_DIR)
print(f"PNG copy complete: {png_count} files, {format_bytes(png_bytes)}")

# Quick verification - show first few items
def preview_dir(p: str, max_items: int = 20):
    try:
        items = sorted(os.listdir(p))[:max_items]
        print(f"\n {p} (showing up to {max_items} items):")
        for it in items:
            print("  -", it)
    except Exception as e:
        print(f" Could not list {p}: {e}")

preview_dir(CSV_DST_DIR)
preview_dir(PNG_DST_DIR)

print("\n Done. Drive now has up-to-date copies (overwritten where names matched).")

In [10]:
# ========================================================================
# Retrieving Library versions for reproducibility
# ========================================================================

import importlib
import pkg_resources

# List of libraries you've imported
imported_libraries = [
    're', 'json', 'numpy', 'pandas', 'matplotlib', 'IPython',
    'unicodedata', 'requests', 'sklearn', 'scipy'
]

# Dictionary to store versions
versioned_packages = {}

for lib in imported_libraries:
    try:
        # Use pkg_resources to get the version if available
        version = pkg_resources.get_distribution(lib).version
        versioned_packages[lib] = version
    except Exception:
        try:
            # Fall back to importing and checking __version__
            module = importlib.import_module(lib)
            version = getattr(module, '__version__', None)
            if version:
                versioned_packages[lib] = version
        except Exception:
            pass

# Create requirements-style lines
requirements_lines = [f"{lib}=={version}" for lib, version in sorted(versioned_packages.items())]
print(requirements_lines)

# Write to a requirements.txt file
with open("requirements.txt", "w") as f:
    f.write("\n".join(requirements_lines))

['IPython==7.34.0', 'json==2.0.9', 'matplotlib==3.10.0', 'numpy==2.0.2', 'pandas==2.2.2', 're==2.2.1', 'requests==2.32.4', 'scipy==1.16.2', 'sklearn==1.6.1']
