In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install libraries for PDF text extraction
!pip install -q pypdfium2 PyMuPDF PyPDF2 tqdm

In [3]:
import pypdfium2 as pdfium
import fitz  # PyMuPDF
import PyPDF2
from pathlib import Path
from tqdm import tqdm
import pandas as pd
from collections import Counter
import re
import time

# ------------- CONFIG -------------
# Folder with your PDFs
PDF_FOLDER = Path("/content/drive/MyDrive/CapstoneProject/Capstone/papers")

# Where to save extracted text files
TEXT_OUTPUT = Path("/content/drive/MyDrive/CapstoneProject/Capstone/text_extraction_new")

# Ground-truth organ DOI logs
LIVER_SOURCE  = "/content/drive/MyDrive/CapstoneProject/Capstone/download_log_liver_transplant.csv"
LUNG_SOURCE   = "/content/drive/MyDrive/CapstoneProject/Capstone/download_log_lung_transplant.csv"
HEART_SOURCE  = "/content/drive/MyDrive/CapstoneProject/Capstone/download_log_heart_transplant.csv"
KIDNEY_SOURCE = "/content/drive/MyDrive/CapstoneProject/Capstone/download_log_kidney_transplant.csv"

# Final metadata file (single file, same columns as old one)
METADATA_CSV = "/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_clean.csv"

TEXT_OUTPUT.mkdir(parents=True, exist_ok=True)

all_pdfs = sorted(list(PDF_FOLDER.glob("**/*.pdf")))
print(f"Total PDFs found: {len(all_pdfs)}")

Total PDFs found: 3831


In [4]:
def filename_to_doi_key(filename: str) -> str:
    """
    Use the PDF filename (without .pdf) as the DOI key.
    Example: '10.1186_1471-2482-8-2.pdf' -> '10.1186_1471-2482-8-2'
    """
    return filename.replace(".pdf", "").strip()


def normalize_doi_from_log_to_key(doi: str) -> str | None:
    """
    Normalize DOIs from log files to match the underscore DOI key format.

    Example:
      '10.1186/1471-2482-8-2'   -> '10.1186_1471-2482-8-2'
      '10.1172/JCI100208,'      -> '10.1172_jci100208'
    """
    if pd.isna(doi):
        return None
    s = str(doi).strip().lower()
    s = s.rstrip(",")
    s = s.replace("/", "_")
    return s

In [5]:
def parse_source_file(content: str):
    """
    Parse concatenated 'doi success' / 'doi failed' style content.
    Pattern: (10.\S+?)(success|failed)
    """
    pattern = r"(10\.\S+?)(success|failed)"
    matches = re.findall(pattern, content, re.IGNORECASE)
    return [(doi, status) for doi, status in matches]

print("\n BUILDING DOI_KEY → ORGAN CATEGORY MAP FROM SOURCE LOGS \n")

source_files = {
    "liver":  LIVER_SOURCE,
    "lung":   LUNG_SOURCE,
    "heart":  HEART_SOURCE,
    "kidney": KIDNEY_SOURCE,
}

doi_key_to_organ = {}
organ_stats = {k: 0 for k in source_files.keys()}

for organ, filepath in source_files.items():
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()
        doi_status_pairs = parse_source_file(content)

        success_count = 0
        for doi, status in doi_status_pairs:
            if status.lower() == "success":
                key = normalize_doi_from_log_to_key(doi)
                if key:
                    doi_key_to_organ[key] = organ
                    success_count += 1

        organ_stats[organ] = success_count
        print(f"{organ:10} : {success_count:4d} successful downloads (from {len(doi_status_pairs)} total)")
    except FileNotFoundError:
        print(f"{organ:10} : File not found at {filepath}")
    except Exception as e:
        print(f"{organ:10} : Error reading {filepath} - {e}")

print(f"\nTotal unique DOI keys in mapping: {len(doi_key_to_organ)}")


 BUILDING DOI_KEY → ORGAN CATEGORY MAP FROM SOURCE LOGS 



  Pattern: (10.\S+?)(success|failed)


liver      : 5649 successful downloads (from 71316 total)
lung       : 1185 successful downloads (from 18588 total)
heart      :  965 successful downloads (from 14303 total)
kidney     : 2264 successful downloads (from 38502 total)

Total unique DOI keys in mapping: 3829


In [6]:
# ROBUST TEXT EXTRACTION

def extract_text_pdfium(pdf_path: Path) -> str:
    chunks = []
    pdf = pdfium.PdfDocument(str(pdf_path))
    for i in range(len(pdf)):
        page = pdf[i]
        textpage = page.get_textpage()
        page_text = textpage.get_text_range()
        if page_text:
            chunks.append(page_text)
    pdf.close()
    return "\n\n".join(chunks)


def extract_text_pymupdf(pdf_path: Path) -> str:
    chunks = []
    doc = fitz.open(str(pdf_path))
    for page in doc:
        page_text = page.get_text()
        if page_text:
            chunks.append(page_text)
    doc.close()
    return "\n\n".join(chunks)


def extract_text_pypdf2(pdf_path: Path) -> str:
    chunks = []
    with open(pdf_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                chunks.append(page_text)
    return "\n\n".join(chunks)


def extract_text_robust(pdf_path: Path):
    """
    Try multiple extractors in order.
    Returns (text, method_used).
    """
    # 1) pdfium
    try:
        text = extract_text_pdfium(pdf_path)
        if text.strip():
            return text, "pdfium"
    except Exception:
        pass

    # 2) PyMuPDF
    try:
        text = extract_text_pymupdf(pdf_path)
        if text.strip():
            return text, "pymupdf"
    except Exception:
        pass

    # 3) PyPDF2
    try:
        text = extract_text_pypdf2(pdf_path)
        if text.strip():
            return text, "pypdf2"
    except Exception:
        pass

    return "", "failed"

In [7]:
# TITLE EXTRACTION

TITLE_BAD_PATTERNS = re.compile(
    r"(journal|volume|issue|no\.|number|suppl|doi|dx\.doi|http|www\.|copyright|©|"
    r"received|accepted|submitted|revised|open access|license|issn|printed|"
    r"correspondence to|author information|keywords?:|references?|contents?)",
    re.IGNORECASE,
)

def looks_like_author_line(line: str) -> bool:
    """
    Heuristic for author/affiliation lines.
    """
    # many commas + initials like "J. Smith, A. Kumar, ..."
    if line.count(",") >= 2 and re.search(r"\b[A-Z]\.\b", line):
        return True

    # affiliation keywords
    if re.search(r"(department|university|hospital|institute|school of|centre|center|clinic)",
                 line, re.IGNORECASE):
        return True

    # emails
    if "@" in line or "email" in line.lower():
        return True

    return False


def get_candidate_indices(lines):
    """
    Choose candidate indices to look for titles:
    - lines within a window above ABSTRACT / SUMMARY
    - else, first ~350 lines
    """
    indices = set()
    n = len(lines)

    abstract_idxs = [
        i for i, l in enumerate(lines)
        if re.match(r"^\s*abstract\b", l, re.IGNORECASE)
    ]
    summary_idxs = [
        i for i, l in enumerate(lines)
        if re.match(r"^\s*summary\b", l, re.IGNORECASE)
    ]

    for idx in abstract_idxs + summary_idxs:
        start = max(0, idx - 12)
        end = max(0, idx)
        for j in range(start, end):
            indices.add(j)

    # Fallback: first 350 lines
    if not indices:
        for j in range(min(350, n)):
            indices.add(j)

    return sorted(indices)


def extract_title_from_text_strict(text: str) -> str:
    """
    Stricter, deeper title search:
    - scans up to 2000 non-empty lines
    - prefers lines above ABSTRACT/SUMMARY
    - heavily filters non-title junk
    """
    if not text:
        return "Unknown"

    raw_lines = text.splitlines()
    lines = [l.strip() for l in raw_lines if l.strip()]
    if not lines:
        return "Unknown"

    if len(lines) > 2000:
        lines = lines[:2000]

    candidate_indices = get_candidate_indices(lines)
    candidates = []

    for idx in candidate_indices:
        line = lines[idx]

        # length filter
        if not (30 <= len(line) <= 220):
            continue

        # avoid obvious non-title stuff
        if TITLE_BAD_PATTERNS.search(line):
            continue

        # avoid section headers
        if re.match(r"^(abstract|introduction|materials and methods|methods|results|discussion)\b",
                    line, re.IGNORECASE):
            continue

        # digit-heavy lines (issue info, tables)
        digit_ratio = sum(c.isdigit() for c in line) / len(line)
        if digit_ratio > 0.25:
            continue

        # author/affiliation
        if looks_like_author_line(line):
            continue

        score = 0

        # earlier index better
        score += max(0, 400 - idx)

        # bonus if near an abstract below
        if any(
            re.match(r"^\s*abstract\b", lines[j], re.IGNORECASE)
            for j in range(idx + 1, min(idx + 5, len(lines)))
        ):
            score += 20

        # titles often don't end with '.'
        if not line.endswith("."):
            score += 5

        # penalize ALL CAPS
        letters_only = re.sub(r"[^A-Za-z]+", "", line)
        if letters_only and letters_only.isupper():
            score -= 10

        # penalize journal-like "Journal, 1985, 69, 320-325" format
        if re.search(r"\b(19|20)\d{2}\b", line) and "," in line and any(
            w in line.lower()
            for w in ["journal", "bmj", "lancet", "gut", "ophthalmology"]
        ):
            score -= 15

        candidates.append((score, line))

    if not candidates:
        # fallback: any reasonable line near top
        for line in lines[:100]:
            if 20 < len(line) < 200:
                return line
        return "Unknown"

    best_score, best_line = max(candidates, key=lambda x: x[0])
    return best_line

In [8]:
# EXTRACT YEAR AND PUBLICATION

def extract_year_simple(text: str):
    """
    Simple year extractor:
    - scan first 5000 chars
    - pick the most frequent 4-digit year between 1950 and 2025
    """
    if not text:
        return None
    header = text[:5000]
    years = []
    for match in re.findall(r"\b(19[5-9]\d|20[0-2]\d)\b", header):
        y = int(match)
        if 1950 <= y <= 2025:
            years.append(y)
    if not years:
        return None
    return Counter(years).most_common(1)[0][0]


def extract_publication_simple(text: str):
    """
    Tiny heuristic to guess a journal-name-like line.
    """
    if not text:
        return None

    header = text[:6000]
    lines = [l.strip() for l in header.splitlines() if l.strip()]

    for line in lines[:80]:
        if len(line) < 20 or len(line) > 200:
            continue

        # contain journal-related keyword
        if re.search(r"\bjournal\b", line, re.IGNORECASE) or any(
            kw in line.lower()
            for kw in ["bmj", "lancet", "gut", "ophthalmology", "hepatology", "kidney", "circulation"]
        ):
            # avoid obvious normal sentences
            if line.count(".") > 1:
                continue
            return line

    return None

In [9]:
metadata_records = []
start_time = time.time()

print("\n BUILDING METADATA (BETTER TITLES + GROUND-TRUTH CATEGORIES) \n")

for pdf_path in tqdm(all_pdfs, desc="Processing PDFs", unit="pdf"):
    doi_key = filename_to_doi_key(pdf_path.name)  # stored as 'doi'
    txt_file = TEXT_OUTPUT / f"{doi_key}.txt"

    record = {
        "pdf_title": "Unknown",
        "doi": doi_key,   # NOTE: underscore DOI key here
        "file_size_mb": round(pdf_path.stat().st_size / (1024 * 1024), 3),
        "text_length": 0,
        "is_scanned": False,
        "needs_ocr": False,
        "extraction_method": "none",
        "year": None,
        "citation_count": None,
        "publication": None,
        "category": None,
    }

    # --- Get text (reuse if exists) ---
    if txt_file.exists():
        try:
            with open(txt_file, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            method = "existing_txt"
        except Exception:
            text = ""
            method = "failed_existing"
    else:
        text, method = extract_text_robust(pdf_path)
        try:
            with open(txt_file, "w", encoding="utf-8") as f:
                f.write(text)
        except Exception:
            pass

    record["extraction_method"] = method
    record["text_length"] = len(text.strip())

    # Mark potential scanned docs for DeepSeek OCR later
    if record["text_length"] < 200 and method != "failed":
        record["is_scanned"] = True
        record["needs_ocr"] = True

    # Title extraction
    if record["text_length"] > 0:
        record["pdf_title"] = extract_title_from_text_strict(text)

        # Year & publication – best effort
        yr = extract_year_simple(text)
        if yr is not None:
            record["year"] = float(yr)
        pub = extract_publication_simple(text)
        if pub:
            record["publication"] = pub

    # Category from DOI logs
    record["category"] = doi_key_to_organ.get(doi_key, "unassigned")

    metadata_records.append(record)

elapsed = (time.time() - start_time) / 60
print(f"\nProcessed {len(metadata_records)} PDFs in {elapsed:.1f} minutes.")


 BUILDING METADATA (BETTER TITLES + GROUND-TRUTH CATEGORIES) 



Processing PDFs: 100%|██████████| 3831/3831 [00:59<00:00, 64.32pdf/s] 



Processed 3831 PDFs in 1.0 minutes.


In [10]:
# SAVE METADATA
metadata_df = pd.DataFrame(metadata_records)
metadata_df.to_csv(METADATA_CSV, index=False)

print("\n METADATA SAVED \n")
print(f"Path : {METADATA_CSV}")
print(f"Rows : {len(metadata_df)}")
print("\nPreview:")
print(metadata_df.head(10).to_string(index=False))


 METADATA SAVED 

Path : /content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_clean.csv
Rows : 3831

Preview:
                                                                                 pdf_title                            doi  file_size_mb  text_length  is_scanned  needs_ocr extraction_method   year citation_count                                                                           publication   category
                                                                 Preparing to download ... 10_1002_14651858_CD005055_pub3         0.001           54        True       True      existing_txt    NaN           None                                                                                  None unassigned
                                                                 Preparing to download ... 10_1002_14651858_CD006660_pub3         0.001           54        True       True      existing_txt    NaN           None                                                    

In [11]:
from pathlib import Path
import pandas as pd
import numpy as np

NEW_METADATA_CSV = "/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_clean.csv"
OLD_METADATA_CSV = "/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_final.csv"

print("Loading new metadata from:", NEW_METADATA_CSV)
md_new = pd.read_csv(NEW_METADATA_CSV)
print("New metadata shape:", md_new.shape)

print("\nLoading old metadata from:", OLD_METADATA_CSV)
md_old = pd.read_csv(OLD_METADATA_CSV)
print("Old metadata shape:", md_old.shape)

print("\nNew columns:", md_new.columns.tolist())
print("Old columns:", md_old.columns.tolist())

Loading new metadata from: /content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_clean.csv
New metadata shape: (3831, 11)

Loading old metadata from: /content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_final.csv
Old metadata shape: (3746, 11)

New columns: ['pdf_title', 'doi', 'file_size_mb', 'text_length', 'is_scanned', 'needs_ocr', 'extraction_method', 'year', 'citation_count', 'publication', 'category']
Old columns: ['pdf_title', 'doi', 'file_size_mb', 'text_length', 'is_scanned', 'needs_ocr', 'extraction_method', 'year', 'citation_count', 'publication', 'category']


In [12]:
# Make sure we don't lose anything
print("\nExample DOIs:")
print("New:", md_new["doi"].head(3).tolist())
print("Old:", md_old["doi"].head(3).tolist())

# Create a unified key in both DataFrames
md_new["doi_key"] = md_new["doi"].astype(str).str.strip()

md_old["doi_key"] = (
    md_old["doi"]
    .astype(str)
    .str.strip()
    .str.replace("/", "_", regex=False)  # 10/1002/... -> 10_1002_...
)

print("\nUnique doi_key counts:")
print("New:", md_new["doi_key"].nunique())
print("Old:", md_old["doi_key"].nunique())


Example DOIs:
New: ['10_1002_14651858_CD005055_pub3', '10_1002_14651858_CD006660_pub3', '10_1002_14651858_CD010985_pub4']
Old: ['10/1136/bjo/69/5/320', '10/1136/bmj/2/6203/1461', '10/1172/JCI109506']

Unique doi_key counts:
New: 3831
Old: 3746


In [13]:
# Build mapping dicts from old metadata
old_by_key = md_old.set_index("doi_key")

year_map        = old_by_key["year"].to_dict()
cites_map       = old_by_key["citation_count"].to_dict()
pub_map         = old_by_key["publication"].to_dict()
category_map    = old_by_key["category"].to_dict()

print("Mapping sizes:")
print("year       :", len(year_map))
print("citations  :", len(cites_map))
print("publication:", len(pub_map))
print("category   :", len(category_map))

Mapping sizes:
year       : 3746
citations  : 3746
publication: 3746
category   : 3746


In [14]:
before_missing = {
    "year": md_new["year"].isna().sum(),
    "citation_count": md_new["citation_count"].isna().sum(),
    "publication": md_new["publication"].isna().sum(),
    "category_unassigned": (md_new["category"].isna() | (md_new["category"] == "unassigned")).sum(),
}

print("Missing BEFORE merge:")
for k, v in before_missing.items():
    print(f"  {k:18s}: {v}")

# Temporary columns pulling from maps
md_new["year_old"]        = md_new["doi_key"].map(year_map)
md_new["cites_old"]       = md_new["doi_key"].map(cites_map)
md_new["pub_old"]         = md_new["doi_key"].map(pub_map)
md_new["category_old"]    = md_new["doi_key"].map(category_map)

# ---- YEAR ----
mask_year = md_new["year"].isna() & md_new["year_old"].notna()
md_new.loc[mask_year, "year"] = md_new.loc[mask_year, "year_old"]

# ---- CITATION COUNT ----
mask_cites = md_new["citation_count"].isna() & md_new["cites_old"].notna()
md_new.loc[mask_cites, "citation_count"] = md_new.loc[mask_cites, "cites_old"]

# ---- PUBLICATION ----
def is_empty_pub(x):
    if pd.isna(x):
        return True
    s = str(x).strip()
    return s == "" or s.lower() == "none"

mask_pub = md_new["publication"].apply(is_empty_pub) & md_new["pub_old"].notna()
md_new.loc[mask_pub, "publication"] = md_new.loc[mask_pub, "pub_old"]

# ---- CATEGORY ----
mask_cat = (md_new["category"].isna() | (md_new["category"] == "unassigned")) & md_new["category_old"].notna()
md_new.loc[mask_cat, "category"] = md_new.loc[mask_cat, "category_old"]

# Enforce numeric types for year and citation_count
md_new["year"] = pd.to_numeric(md_new["year"], errors="coerce").astype("float64")
md_new["citation_count"] = pd.to_numeric(md_new["citation_count"], errors="coerce").astype("float64")

after_missing = {
    "year": md_new["year"].isna().sum(),
    "citation_count": md_new["citation_count"].isna().sum(),
    "publication": md_new["publication"].isna().sum(),
    "category_unassigned": (md_new["category"].isna() | (md_new["category"] == "unassigned")).sum(),
}

print("\nMissing AFTER merge:")
for k, v in after_missing.items():
    print(f"  {k:18s}: {v}")

print("\nImprovements:")
for k in before_missing:
    print(f"  {k:18s}: {before_missing[k]} → {after_missing[k]}  (Δ = {before_missing[k] - after_missing[k]})")

Missing BEFORE merge:
  year              : 209
  citation_count    : 3831
  publication       : 2131
  category_unassigned: 3831

Missing AFTER merge:
  year              : 172
  citation_count    : 3761
  publication       : 713
  category_unassigned: 85

Improvements:
  year              : 209 → 172  (Δ = 37)
  citation_count    : 3831 → 3761  (Δ = 70)
  publication       : 2131 → 713  (Δ = 1418)
  category_unassigned: 3831 → 85  (Δ = 3746)


In [15]:
# Drop temp columns we used for merge
md_new = md_new.drop(columns=["doi_key", "year_old", "cites_old", "pub_old", "category_old"], errors="ignore")

# Sanity check: dtypes
print("\nFinal dtypes:")
print(md_new.dtypes)

# Save back to same CSV (or a new one if you want to be safe)
FINAL_METADATA_CSV = "/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_clean.csv"
md_new.to_csv(FINAL_METADATA_CSV, index=False)

print(f"\n Merged metadata saved to: {FINAL_METADATA_CSV}")
print(f"Rows: {len(md_new)}")


Final dtypes:
pdf_title             object
doi                   object
file_size_mb         float64
text_length            int64
is_scanned              bool
needs_ocr               bool
extraction_method     object
year                 float64
citation_count       float64
publication           object
category              object
dtype: object

 Merged metadata saved to: /content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_clean.csv
Rows: 3831


In [16]:
print("="*80)
print("SPOT-CHECK: rows where we filled year/publication/category from old metadata")
print("="*80)

# Reload old with normalized key so we can join if needed
md_old["doi_key"] = (
    md_old["doi"]
    .astype(str)
    .str.strip()
    .str.replace("/", "_", regex=False)
)

old_indexed = md_old.set_index("doi_key")

sample_mask = (
    (md_new["doi"].isin(old_indexed.index)) &
    (
        md_new["year"].notna() |
        md_new["publication"].notna() |
        md_new["citation_count"].notna() |
        (md_new["category"] != "unassigned")
    )
)

sample = md_new[sample_mask].head(10).copy()

for _, row in sample.iterrows():
    key = row["doi"]
    print("\n------------------------------")
    print("DOI key        :", key)
    print("NEW title      :", row["pdf_title"][:80])
    print("NEW year       :", row["year"])
    print("NEW cites      :", row["citation_count"])
    print("NEW publication:", row["publication"])
    print("NEW category   :", row["category"])

    if key in old_indexed.index:
        o = old_indexed.loc[key]
        print("OLD year       :", o.get("year", None))
        print("OLD cites      :", o.get("citation_count", None))
        print("OLD publication:", (str(o.get("publication", ""))[:80] if pd.notna(o.get("publication", None)) else None))
        print("OLD category   :", o.get("category", None))

SPOT-CHECK: rows where we filled year/publication/category from old metadata

------------------------------
DOI key        : 10_1002_14651858_CD005055_pub3
NEW title      : Preparing to download ...
NEW year       : nan
NEW cites      : nan
NEW publication: nan
NEW category   : heart
OLD year       : nan
OLD cites      : nan
OLD publication: None
OLD category   : heart

------------------------------
DOI key        : 10_1002_14651858_CD006660_pub3
NEW title      : Preparing to download ...
NEW year       : nan
NEW cites      : nan
NEW publication: nan
NEW category   : lung
OLD year       : nan
OLD cites      : nan
OLD publication: None
OLD category   : lung

------------------------------
DOI key        : 10_1002_14651858_CD010985_pub4
NEW title      : Better health. Cochrane Database of Systematic Reviews
NEW year       : 2020.0
NEW cites      : nan
NEW publication: sickle cell disease (Review)
NEW category   : liver
OLD year       : 2020.0
OLD cites      : nan
OLD publication: sickl

In [17]:
from google.colab import drive
drive.mount('/content/drive')

# Install libraries for PDF text extraction
!pip install -q pypdfium2 PyMuPDF PyPDF2 tqdm

import fitz  # PyMuPDF
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import re
from collections import Counter

# CONFIG

PDF_FOLDER = Path("/content/drive/MyDrive/CapstoneProject/Capstone/papers")

INPUT_METADATA_CSV  = "/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_clean.csv"
OUTPUT_METADATA_CSV = "/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_layout_fixed.csv"

# LOAD METADATA

md = pd.read_csv(INPUT_METADATA_CSV)
print("Loaded metadata:", md.shape)

if "doi" not in md.columns:
    raise ValueError("Metadata must have a 'doi' column (underscore-style keys).")

md["doi"] = md["doi"].astype(str).str.strip()

# PATTERNS

TITLE_BAD_PATTERNS = re.compile(
    r"(journal|volume|issue|no\.|number|suppl|supplement|pages|pp\.|doi|dx\.doi|http|www\.|"
    r"copyright|©|received|accepted|submitted|revised|open access|license|issn|printed|references?|"
    r"table\s+\d+|figure\s+\d+|contents?|index)",
    re.IGNORECASE,
)

SECTION_HEADER_PAT = re.compile(
    r"^(abstract|introduction|background|methods?|materials and methods|results?|discussion|conclusions?)\b",
    re.IGNORECASE,
)

AFFILIATION_PAT = re.compile(
    r"(department|university|hospital|institute|school of|faculty|centre|center|clinic|address)",
    re.IGNORECASE,
)

PUBLISHER_KEYWORDS = re.compile(
    r"(published by|publisher[: ]|publishing|press\b|wiley|elsevier|springer|sage publications|"
    r"oxford university press|cambridge university press)",
    re.IGNORECASE,
)

COPYRIGHT_LINE = re.compile(
    r"(?:©|copyright)\s*(?:\d{4})?.*",
    re.IGNORECASE,
)

YEAR_PATTERN = re.compile(r"\b(19[5-9]\d|20[0-2]\d)\b")



# CORE HELPERS

def extract_title_from_spans(spans):
    if not spans:
        return None

    spans_sorted = sorted(spans, key=lambda s: (s["page"], s["y"], s["x"]))
    sizes = sorted({round(s["size"], 1) for s in spans}, reverse=True)
    if not sizes:
        return None

    top_sizes = sizes[:3]

    groups = {}
    for s in spans_sorted:
        if s["size"] not in top_sizes:
            continue
        row_key = int(round(s["y"] / 3.0))
        key = (s["page"], row_key, s["size"])
        groups.setdefault(key, []).append(s)

    candidates = []
    for (page, row, size), gspans in groups.items():
        gspans_sorted = sorted(gspans, key=lambda s: s["x"])
        text = " ".join(gs["text"] for gs in gspans_sorted).strip()

        if len(text) < 15 or len(text) > 250:
            continue
        if TITLE_BAD_PATTERNS.search(text):
            continue
        if SECTION_HEADER_PAT.search(text):
            continue
        if AFFILIATION_PAT.search(text):
            continue

        score = 0
        score += size * 2
        score += max(0, 100 - page * 30 - row * 0.5)

        if 40 <= len(text) <= 150:
            score += 15
        elif 25 <= len(text) < 40 or 150 < len(text) <= 220:
            score += 5
        else:
            score -= 5

        letters_only = re.sub(r"[^A-Za-z]+", "", text)
        if letters_only and letters_only.isupper():
            score -= 10

        digit_ratio = sum(c.isdigit() for c in text) / len(text)
        if digit_ratio > 0.25:
            score -= 8

        candidates.append((score, page, row, size, text))

    if not candidates:
        for s in spans_sorted[:60]:
            t = s["text"].strip()
            if 20 < len(t) < 200 and not TITLE_BAD_PATTERNS.search(t):
                return t
        return None

    best = max(candidates, key=lambda x: x[0])
    return best[-1]


def extract_publisher_from_lines(lines):
    if not lines:
        return None

    publisher_candidates = []
    for line in lines[:80]:
        if PUBLISHER_KEYWORDS.search(line) or COPYRIGHT_LINE.search(line):
            publisher_candidates.append(line.strip())

    if not publisher_candidates:
        return None

    pb = [l for l in publisher_candidates if "published by" in l.lower()]
    if pb:
        return min(pb, key=len)

    return min(publisher_candidates, key=len)


def extract_year_from_lines(lines, publisher_line=None):
    candidates = []

    def add_years_from_text(txt, weight=1):
        if not txt:
            return
        for m in YEAR_PATTERN.findall(txt):
            y = int(m)
            if 1950 <= y <= 2025:
                candidates.extend([y] * weight)

    if publisher_line:
        add_years_from_text(publisher_line, weight=3)

    for line in lines[:100]:
        low = line.lower()
        if "published" in low or "copyright" in low or "©" in line:
            add_years_from_text(line, weight=2)

    for line in lines[:80]:
        add_years_from_text(line, weight=1)

    if not candidates:
        return None

    cnt = Counter(candidates)
    most_common = cnt.most_common()
    top_freq = most_common[0][1]
    top_years = [y for y, f in most_common if f == top_freq]
    return min(top_years)


def extract_from_pdf_layout_safe(pdf_path: Path):
    """
    Safe wrapper:
      - skips very large PDFs (>50 MB)
      - only first page
      - handles all exceptions
    """
    # skip insane files
    size_mb = pdf_path.stat().st_size / (1024 * 1024)
    if size_mb > 50:
        return None, None, None

    try:
        doc = fitz.open(str(pdf_path))
    except Exception:
        return None, None, None

    spans = []
    text_lines = []

    try:
        max_pages = min(1, len(doc))  # only FIRST PAGE to avoid heavy stuff
        for page_idx in range(max_pages):
            page = doc[page_idx]

            try:
                page_text = page.get_text("text")
                text_lines.extend(
                    [l.strip() for l in page_text.splitlines() if l.strip()]
                )
            except Exception:
                pass

            try:
                dict_text = page.get_text("dict")
                for block in dict_text.get("blocks", []):
                    for line in block.get("lines", []):
                        for span in line.get("spans", []):
                            txt = span.get("text", "").strip()
                            if not txt:
                                continue
                            size = span.get("size", 0.0)
                            bbox = span.get("bbox", [0, 0, 0, 0])
                            x0, y0, _, _ = bbox
                            spans.append(
                                {
                                    "page": page_idx,
                                    "x": x0,
                                    "y": y0,
                                    "size": size,
                                    "text": txt,
                                }
                            )
            except Exception:
                # If dict extraction fails, we still might have text_lines
                pass

    finally:
        doc.close()

    if not spans and not text_lines:
        return None, None, None

    title = extract_title_from_spans(spans) if spans else None
    publisher = extract_publisher_from_lines(text_lines)
    year = extract_year_from_lines(text_lines, publisher_line=publisher)

    return title, publisher, year



# APPLY TO ALL PDFs (SAFE)

new_titles = 0
new_pubs = 0
new_years = 0
missing_pdfs = 0
errors = []

for idx, row in tqdm(md.iterrows(), total=len(md), desc="Fixing from layout", unit="pdf"):
    doi_key = str(row["doi"]).strip()
    pdf_path = PDF_FOLDER / f"{doi_key}.pdf"

    if idx % 200 == 0:
        print(f"\n[INFO] At row {idx}, doi_key = {doi_key}")

    if not pdf_path.exists():
        missing_pdfs += 1
        continue

    try:
        title, publisher, year = extract_from_pdf_layout_safe(pdf_path)
    except Exception as e:
        errors.append((doi_key, str(e)[:200]))
        continue

    if title and isinstance(title, str):
        md.at[idx, "pdf_title"] = title
        new_titles += 1

    if publisher and isinstance(publisher, str):
        md.at[idx, "publication"] = publisher
        new_pubs += 1

    if year is not None:
        md.at[idx, "year"] = float(year)
        new_years += 1

print("\n SUMMARY")
print("Missing PDFs (not found on disk):", missing_pdfs)
print("Titles updated                   :", new_titles)
print("Publication (publisher) updated  :", new_pubs)
print("Years updated                    :", new_years)
print("Errors during extraction         :", len(errors))

if errors:
    print("\nSample errors:")
    for doi_key, msg in errors[:5]:
        print(f"  {doi_key}: {msg}")

md["year"] = pd.to_numeric(md["year"], errors="coerce").astype("float64")

md.to_csv(OUTPUT_METADATA_CSV, index=False)
print("\nSaved updated metadata with layout-based fields to:")
print(OUTPUT_METADATA_CSV)

print("\nPreview (first 10 rows):")
print(md[["doi", "pdf_title", "year", "publication"]].head(10).to_string(index=False))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded metadata: (3831, 11)


Fixing from layout:   0%|          | 2/3831 [00:00<05:59, 10.64pdf/s]


[INFO] At row 0, doi_key = 10_1002_14651858_CD005055_pub3


Fixing from layout:   3%|▎         | 108/3831 [00:57<16:20,  3.80pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:   5%|▌         | 205/3831 [01:07<02:11, 27.58pdf/s]


[INFO] At row 200, doi_key = 10_1038_modpathol_2014_79


Fixing from layout:  11%|█         | 405/3831 [01:26<02:42, 21.03pdf/s]


[INFO] At row 400, doi_key = 10_1038_s41598-020-57728-x


Fixing from layout:  16%|█▌        | 605/3831 [01:35<02:28, 21.69pdf/s]


[INFO] At row 600, doi_key = 10_1097_MPG_0000000000001792


Fixing from layout:  21%|██        | 803/3831 [03:16<07:38,  6.60pdf/s]


[INFO] At row 800, doi_key = 10_1136_bmj_e3054


Fixing from layout:  26%|██▌       | 1004/3831 [04:00<02:40, 17.66pdf/s]


[INFO] At row 1000, doi_key = 10_1136_gutjnl-2019-319630


Fixing from layout:  31%|███▏      | 1201/3831 [05:41<10:54,  4.02pdf/s]


[INFO] At row 1200, doi_key = 10_1136_thx_2006_068494


Fixing from layout:  33%|███▎      | 1265/3831 [05:55<08:19,  5.14pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:  37%|███▋      | 1400/3831 [06:37<01:47, 22.60pdf/s]


[INFO] At row 1400, doi_key = 10_1172_JCI119114
MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:  37%|███▋      | 1404/3831 [06:37<01:46, 22.85pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:  37%|███▋      | 1407/3831 [06:37<01:40, 24.21pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:  37%|███▋      | 1411/3831 [06:37<01:33, 25.99pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:  42%|████▏     | 1605/3831 [06:46<01:20, 27.56pdf/s]


[INFO] At row 1600, doi_key = 10_1186_1471-2253-14-125


Fixing from layout:  47%|████▋     | 1804/3831 [06:54<02:23, 14.10pdf/s]


[INFO] At row 1800, doi_key = 10_1186_1741-7015-11-161


Fixing from layout:  52%|█████▏    | 2006/3831 [07:03<01:02, 29.06pdf/s]


[INFO] At row 2000, doi_key = 10_1186_cc2978


Fixing from layout:  58%|█████▊    | 2205/3831 [07:12<01:08, 23.88pdf/s]


[INFO] At row 2200, doi_key = 10_1186_s13000-020-01049-0


Fixing from layout:  63%|██████▎   | 2403/3831 [08:04<09:16,  2.57pdf/s]


[INFO] At row 2400, doi_key = 10_12688_f1000research_12239_1


Fixing from layout:  68%|██████▊   | 2604/3831 [08:15<00:55, 22.08pdf/s]


[INFO] At row 2600, doi_key = 10_1371_journal_pone_0053960


Fixing from layout:  73%|███████▎  | 2802/3831 [08:22<00:49, 20.91pdf/s]


[INFO] At row 2800, doi_key = 10_1371_journal_pone_0188494


Fixing from layout:  77%|███████▋  | 2937/3831 [08:29<00:58, 15.28pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:  78%|███████▊  | 3001/3831 [08:32<00:30, 26.90pdf/s]


[INFO] At row 3000, doi_key = 10_1590_S0102-76382010000400005


Fixing from layout:  82%|████████▏ | 3127/3831 [08:44<01:12,  9.71pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout:  84%|████████▎ | 3203/3831 [08:48<00:26, 23.33pdf/s]


[INFO] At row 3200, doi_key = 10_3174_ajnr_A1768


Fixing from layout:  88%|████████▊ | 3380/3831 [09:02<00:17, 25.58pdf/s]

MuPDF error: format error: out of range code encountered in lzw decode

MuPDF error: library error: FT_New_Memory_Face(OTATET+HelveticaNeueLTStd-Lt): unknown file format



Fixing from layout:  89%|████████▉ | 3403/3831 [09:02<00:14, 29.28pdf/s]


[INFO] At row 3400, doi_key = 10_3389_fmed_2020_599434


Fixing from layout:  94%|█████████▍| 3600/3831 [09:18<01:47,  2.16pdf/s]


[INFO] At row 3600, doi_key = 10_4049_jimmunol_164_2_656


Fixing from layout:  99%|█████████▉| 3806/3831 [10:24<00:00, 27.77pdf/s]


[INFO] At row 3800, doi_key = 10_7164_antibiotics_34_1619


Fixing from layout: 100%|█████████▉| 3828/3831 [10:25<00:00, 22.75pdf/s]

MuPDF error: format error: cmsOpenProfileFromMem failed



Fixing from layout: 100%|██████████| 3831/3831 [10:26<00:00,  6.12pdf/s]


 SUMMARY
Missing PDFs (not found on disk): 0
Titles updated                   : 3644
Publication (publisher) updated  : 1929
Years updated                    : 3436
Errors during extraction         : 0

Saved updated metadata with layout-based fields to:
/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new_layout_fixed.csv

Preview (first 10 rows):
                           doi                                                                       pdf_title   year                                                                                                                               publication
10_1002_14651858_CD005055_pub3                                                    HHS Vulnerability Disclosure    NaN                                                                                                                                       NaN
10_1002_14651858_CD006660_pub3                                                    HHS Vulnerability Disclosure    NaN           




In [18]:
# Enrich titles/year/journal/citations from DOI APIs


from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import requests
import time
import re
from pathlib import Path
from tqdm import tqdm


# CONFIG

BASE_DIR = Path("/content/drive/MyDrive/CapstoneProject/Capstone")

# Your current metadata file (with layout-based titles)
INPUT_METADATA_CSV  = BASE_DIR / "metadata_new_layout_fixed.csv"
OUTPUT_METADATA_CSV = BASE_DIR / "metadata_enriched_from_apis.csv"

# Original download logs (ground truth DOIs)
LIVER_SOURCE  = BASE_DIR / "download_log_liver_transplant.csv"
LUNG_SOURCE   = BASE_DIR / "download_log_lung_transplant.csv"
HEART_SOURCE  = BASE_DIR / "download_log_heart_transplant.csv"
KIDNEY_SOURCE = BASE_DIR / "download_log_kidney_transplant.csv"

# polite user agent for APIs
YOUR_EMAIL = "tn2463@nyu.edu"

# 1) LOAD METADATA

md = pd.read_csv(INPUT_METADATA_CSV)
print("Loaded metadata:", md.shape)

if "doi" not in md.columns:
    raise ValueError("Metadata must contain a 'doi' column (underscore-style keys).")

md["doi"] = md["doi"].astype(str).str.strip().str.lower()


# 2) REBUILD KEY → CANONICAL DOI FROM LOGS

def parse_source_file(content: str):
    """
    Your logs look like: '10.1002/hep.30368success10.1002/hep.30370failed...'
    We pull (doi, status) pairs.
    """
    pattern = r"(10\.\S+?)(success|failed)"
    return re.findall(pattern, content, re.IGNORECASE)

def doi_to_key(doi: str) -> str:
    """
    Recreate the filename-style key used for PDFs/metadata.
    Example:
      DOI  '10.1002/hep.30368'   -> '10_1002_hep_30368'
      DOI  '10.1186/1471-2482-8-2' -> '10_1186_1471-2482-8-2'
    i.e. replace '/' and '.' with '_' then lowercase.
    """
    s = str(doi).strip().rstrip(",").lower()
    s = s.replace("/", "_").replace(".", "_")
    return s

source_files = {
    "liver":  LIVER_SOURCE,
    "lung":   LUNG_SOURCE,
    "heart":  HEART_SOURCE,
    "kidney": KIDNEY_SOURCE,
}

key_to_doi = {}
organ_stats = {k: 0 for k in source_files.keys()}

print("\nRebuilding DOI key → canonical DOI mapping from logs...")

for organ, path in source_files.items():
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
        doi_status_pairs = parse_source_file(content)

        success_count = 0
        for doi, status in doi_status_pairs:
            if status.lower() == "success":
                key = doi_to_key(doi)
                # last one wins if duplicates, that's fine
                key_to_doi[key] = doi.strip().rstrip(",")
                success_count += 1

        organ_stats[organ] = success_count
        print(f"  {organ:10}: {success_count:5d} successful DOIs parsed")
    except FileNotFoundError:
        print(f"  {organ:10}: LOG FILE MISSING at {path}")
    except Exception as e:
        print(f"  {organ:10}: Error reading {path}: {e}")

print(f"\nTotal unique keys in mapping: {len(key_to_doi)}")

# Attach canonical_doi to metadata
md["canonical_doi"] = md["doi"].map(key_to_doi)
known_canonical = md["canonical_doi"].notna().sum()
print(f"Rows with known canonical DOI from logs: {known_canonical}/{len(md)}")


# 3) CROSSREF & OPENALEX HELPERS

CR_BASE = "https://api.crossref.org/works/"
OA_BASE = "https://api.openalex.org/works/doi:"

CR_HEADERS = {"User-Agent": f"taruni-metadata-bot/1.0 (mailto:{YOUR_EMAIL})"}
OA_HEADERS = {"User-Agent": f"taruni-metadata-bot/1.0 (mailto:{YOUR_EMAIL})"}

def fetch_crossref(doi: str):
    url = CR_BASE + doi
    try:
        r = requests.get(url, headers=CR_HEADERS, timeout=10)
        if r.status_code != 200:
            return None
        return r.json().get("message", {})
    except Exception:
        return None

def fetch_openalex(doi: str):
    url = OA_BASE + doi
    try:
        r = requests.get(url, headers=OA_HEADERS, timeout=10)
        if r.status_code != 200:
            return None
        return r.json()
    except Exception:
        return None

def parse_cr_metadata(msg: dict):
    """
    Extract title, journal, year, citations from Crossref 'message'.
    Returns (title, journal, year, citations)
    """
    if not msg:
        return None, None, None, None

    # title
    title = None
    tlist = msg.get("title")
    if isinstance(tlist, list) and tlist:
        title = tlist[0].strip() or None

    # journal/container
    journal = None
    clist = msg.get("container-title")
    if isinstance(clist, list) and clist:
        journal = clist[0].strip() or None

    # year
    year = None
    def pull_year(field):
        d = msg.get(field)
        if isinstance(d, dict):
            parts = d.get("date-parts")
            if isinstance(parts, list) and parts and parts[0]:
                return parts[0][0]
        return None

    year = pull_year("published-print") or pull_year("published-online") or pull_year("issued")
    if isinstance(year, list):
        year = year[0]
    if isinstance(year, str):
        try:
            year = int(year)
        except:
            year = None

    # citations
    cites = msg.get("is-referenced-by-count")
    if isinstance(cites, str):
        try:
            cites = int(cites)
        except:
            cites = None

    return title, journal, year, cites

def parse_oa_metadata(obj: dict):
    """
    Extract title, journal, year, citations from OpenAlex object.
    Returns (title, journal, year, citations)
    """
    if not obj:
        return None, None, None, None

    title = obj.get("title") or None
    journal = None
    loc = obj.get("primary_location") or {}
    src = loc.get("source") or {}
    if src:
        journal = src.get("display_name") or None

    year = obj.get("publication_year") or None
    cites = obj.get("cited_by_count") or None

    return title, journal, year, cites



# 4) ENRICH METADATA FROM APIS

md["metadata_source"] = md.get("metadata_source", pd.Series(["original"] * len(md)))

updated_title = 0
updated_pub   = 0
updated_year  = 0
updated_cites = 0
api_calls_cr  = 0
api_calls_oa  = 0
errors        = []

print("\nEnriching metadata from Crossref + OpenAlex ...")
print("This may take a while; we only hit rows with canonical_doi.")

for idx, row in tqdm(md.iterrows(), total=len(md), desc="DOI enrichment", unit="paper"):
    canon = row["canonical_doi"]
    if not isinstance(canon, str) or not canon.strip():
        continue  # no DOI -> skip

    canon = canon.strip()

    # Crossref
    cr = fetch_crossref(canon)
    time.sleep(0.15)   # be nice to API
    api_calls_cr += 1

    cr_title, cr_journal, cr_year, cr_cites = parse_cr_metadata(cr)

    # OpenAlex (only if we still miss stuff or want citations)
    oa_title = oa_journal = oa_year = oa_cites = None
    # Only call OpenAlex if we still need citations OR year OR title/journal
    need_oa = (cr_cites is None) or (cr_year is None) or (cr_title is None) or (cr_journal is None)
    if need_oa:
        oa = fetch_openalex(canon)
        time.sleep(0.15)
        api_calls_oa += 1
        oa_title, oa_journal, oa_year, oa_cites = parse_oa_metadata(oa)

    # ---------------- MERGE LOGIC ----------------
    # TITLE: prefer Crossref > OpenAlex > existing
    new_title = cr_title or oa_title
    if new_title and isinstance(new_title, str):
        if new_title != row.get("pdf_title", ""):
            md.at[idx, "pdf_title"] = new_title
            updated_title += 1
        src = md.at[idx, "metadata_source"]
        if "crossref" not in src and cr_title:
            md.at[idx, "metadata_source"] = (src + "+crossref").strip("+")
        elif "openalex" not in src and oa_title and not cr_title:
            md.at[idx, "metadata_source"] = (src + "+openalex").strip("+")

    # PUBLICATION/JOURNAL
    new_pub = cr_journal or oa_journal
    if new_pub and isinstance(new_pub, str):
        if pd.isna(row.get("publication")) or not str(row.get("publication")).strip():
            md.at[idx, "publication"] = new_pub
            updated_pub += 1

    # YEAR
    new_year = cr_year or oa_year
    if new_year:
        try:
            new_year = int(new_year)
            if new_year >= 1950 and new_year <= 2030:
                old_year = row.get("year")
                if pd.isna(old_year) or int(old_year) != new_year:
                    md.at[idx, "year"] = float(new_year)
                    updated_year += 1
        except Exception:
            pass

    # CITATIONS
    new_cites = cr_cites if cr_cites is not None else oa_cites
    if new_cites is not None:
        try:
            new_cites = int(new_cites)
            old_cites = row.get("citation_count")
            if pd.isna(old_cites) or int(old_cites) != new_cites:
                md.at[idx, "citation_count"] = float(new_cites)
                updated_cites += 1
        except Exception:
            pass

print("\nSUMMARY")
print(f"Rows with canonical DOI            : {known_canonical}")
print(f"Crossref API calls                 : {api_calls_cr}")
print(f"OpenAlex API calls                 : {api_calls_oa}")
print(f"Titles updated from APIs           : {updated_title}")
print(f"Publication/journal updated        : {updated_pub}")
print(f"Years updated                      : {updated_year}")
print(f"Citation counts updated            : {updated_cites}")

# Ensure citation_count stays numeric too
md["citation_count"] = pd.to_numeric(md["citation_count"], errors="coerce")

# Convert year to proper integer type
md["year"] = pd.to_numeric(md["year"], errors="coerce").astype("Int64")

md.to_csv(OUTPUT_METADATA_CSV, index=False)
print("Saved with integer year:", OUTPUT_METADATA_CSV)


md.to_csv(OUTPUT_METADATA_CSV, index=False)
print("\nSaved enriched metadata to:")
print(OUTPUT_METADATA_CSV)

print("\nPreview of enriched rows:")
print(md[["doi", "canonical_doi", "pdf_title", "year", "publication", "citation_count", "metadata_source"]].head(12).to_string(index=False))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded metadata: (3831, 11)

Rebuilding DOI key → canonical DOI mapping from logs...
  liver     :  5649 successful DOIs parsed
  lung      :  1185 successful DOIs parsed
  heart     :   965 successful DOIs parsed
  kidney    :  2264 successful DOIs parsed

Total unique keys in mapping: 3829
Rows with known canonical DOI from logs: 3819/3831

Enriching metadata from Crossref + OpenAlex ...
This may take a while; we only hit rows with canonical_doi.


DOI enrichment: 100%|██████████| 3831/3831 [12:11<00:00,  5.24paper/s]



SUMMARY
Rows with canonical DOI            : 3819
Crossref API calls                 : 3819
OpenAlex API calls                 : 29
Titles updated from APIs           : 3678
Publication/journal updated        : 488
Years updated                      : 970
Citation counts updated            : 3815
Saved with integer year: /content/drive/MyDrive/CapstoneProject/Capstone/metadata_enriched_from_apis.csv

Saved enriched metadata to:
/content/drive/MyDrive/CapstoneProject/Capstone/metadata_enriched_from_apis.csv

Preview of enriched rows:
                           doi                  canonical_doi                                                                                                                                                                                                                                                               pdf_title  year                                                                                                                               pu

In [20]:
# "SCHOLAR" (SEMANTIC SCHOLAR) ENRICHMENT + CLEANUP

from google.colab import drive
drive.mount("/content/drive", force_remount=False)

import pandas as pd
import requests
import re
import time
from pathlib import Path
from tqdm import tqdm

# CONFIG

BASE_DIR = Path("/content/drive/MyDrive/CapstoneProject/Capstone")

# Use your latest enriched file here
INPUT_CSV  = BASE_DIR / "metadata_enriched_from_apis.csv"
OUTPUT_CSV = BASE_DIR / "metadata_enriched_scholar.csv"

print("Loading metadata...")
md = pd.read_csv(INPUT_CSV)
print("Shape:", md.shape)

required_cols = ["doi", "canonical_doi", "pdf_title", "publication", "year", "citation_count"]
for col in required_cols:
    if col not in md.columns:
        raise ValueError(f"Expected column '{col}' not found in metadata!")

# HELPER FUNCTIONS: detect bad metadata

def is_bad_publication(pub):
    if not isinstance(pub, str):
        return True
    s = pub.strip().lower()
    if not s:
        return True

    # obvious junk / affiliation / portal text
    bad_substrings = [
        "division of",
        "department of",
        "school of",
        "university",
        "hospital",
        "research portal",
        "research explorer",
        "king's research",
        "edinburgh research",
        "portal are retained by the authors",
        "copyright and moral rights",
        "copyright ©",
        "viewing from",
        "preparing to download",
        "request unsuccessful",
        "incapsula incident",
        "zora (zurich open repository",
        "this article is protected by copyright",
        "all rights reserved"
    ]

    if any(bad in s for bad in bad_substrings):
        return True

    # if it has no letters at all
    if not re.search(r"[a-zA-Z]", s):
        return True

    # extremely long = probably a sentence, not journal name
    if len(s) > 200:
        return True

    return False


def is_bad_title(title):
    if not isinstance(title, str):
        return True
    s = title.strip()
    if not s:
        return True

    lower = s.lower()

    # obvious garbage / placeholders
    bad_exact = {
        "unknown",
        "preparing to download ...",
        "publisher's pdf, also known as version of record"
    }
    if lower in bad_exact:
        return True

    # looks like affiliation / generic header
    if "division of" in lower or "department of" in lower or "university" in lower:
        return True

    # too short / too long
    if len(s) < 15 or len(s) > 350:
        return True

    return False


def strip_html(text):
    if not isinstance(text, str):
        return text
    # remove HTML tags
    text = re.sub(r"<[^>]+>", "", text)
    # collapse whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()



# DECIDE WHICH ROWS TO TOUCH

md["canonical_doi"] = md["canonical_doi"].astype(str)
has_canon = md["canonical_doi"].notna() & md["canonical_doi"].str.strip().ne("nan")

bad_pub_mask   = md["publication"].apply(is_bad_publication)
bad_title_mask = md["pdf_title"].apply(is_bad_title)
missing_cites  = md["citation_count"].isna()

to_fix_mask = has_canon & (bad_pub_mask | bad_title_mask | missing_cites)
rows_to_fix = md[to_fix_mask].copy()

print("\nROW STATS")
print("---------")
print(f"Total rows:               {len(md)}")
print(f"Rows with canonical DOI:  {has_canon.sum()}")
print(f"Rows with bad publication:{bad_pub_mask.sum()}")
print(f"Rows with bad title:      {bad_title_mask.sum()}")
print(f"Rows missing citations:   {missing_cites.sum()}")
print(f"Rows to hit via SemScholar: {rows_to_fix.shape[0]}")

# SEMANTIC SCHOLAR API

# Docs: https://api.semanticscholar.org
# We'll use the public Graph API: /graph/v1/paper/DOI:<doi>?fields=...

SS_BASE = "https://api.semanticscholar.org/graph/v1/paper/DOI:"
SS_FIELDS = "title,year,venue,citationCount"

SS_HEADERS = {
    "User-Agent": "taruni-metadata-bot/1.0 (semantic-scholar enrichment)",
}

def fetch_semantic_scholar(doi):
    url = f"{SS_BASE}{doi}"
    params = {"fields": SS_FIELDS}
    try:
        r = requests.get(url, headers=SS_HEADERS, params=params, timeout=10)
        if r.status_code != 200:
            return None
        return r.json()
    except Exception:
        return None

def parse_ss_meta(obj):
    if not obj:
        return None, None, None, None
    title = obj.get("title") or None
    year = obj.get("year") or None
    venue = obj.get("venue") or None
    cites = obj.get("citationCount") or None
    return title, venue, year, cites

# APPLY ENRICHMENT

if "metadata_source" not in md.columns:
    md["metadata_source"] = "original"

updated_title = 0
updated_pub   = 0
updated_year  = 0
updated_cites = 0

api_calls = 0

print("\nEnriching from Semantic Scholar (targeted)...")

for idx in tqdm(rows_to_fix.index, desc="SemScholar", unit="row"):
    canon = md.at[idx, "canonical_doi"]
    if not isinstance(canon, str) or not canon.strip() or canon.strip().lower() == "nan":
        continue
    canon = canon.strip()

    ss_obj = fetch_semantic_scholar(canon)
    api_calls += 1
    time.sleep(0.15)  # chill a bit to respect rate limits

    ss_title, ss_venue, ss_year, ss_cites = parse_ss_meta(ss_obj)

    # TITLE
    old_title = md.at[idx, "pdf_title"]
    new_title = old_title

    if is_bad_title(old_title) and ss_title:
        new_title = ss_title
    elif ss_title and len(ss_title) > 15 and len(ss_title) < 350:
        # if existing title looks kind of meh, prefer SemScholar one
        if is_bad_title(old_title):
            new_title = ss_title

    if isinstance(new_title, str):
        new_title_clean = strip_html(new_title)
    else:
        new_title_clean = new_title

    if isinstance(new_title_clean, str) and new_title_clean and new_title_clean != old_title:
        md.at[idx, "pdf_title"] = new_title_clean
        updated_title += 1
        if "semanticscholar" not in str(md.at[idx, "metadata_source"]).lower():
            md.at[idx, "metadata_source"] = str(md.at[idx, "metadata_source"]) + "+semanticscholar"

    # PUBLICATION / VENUE
    old_pub = md.at[idx, "publication"]
    if (is_bad_publication(old_pub) or pd.isna(old_pub)) and isinstance(ss_venue, str) and ss_venue.strip():
        md.at[idx, "publication"] = ss_venue.strip()
        updated_pub += 1

    # YEAR
    old_year = md.at[idx, "year"]
    if ss_year:
        try:
            ss_year_int = int(ss_year)
            if 1950 <= ss_year_int <= 2030:
                if pd.isna(old_year) or int(old_year) != ss_year_int:
                    md.at[idx, "year"] = ss_year_int
                    updated_year += 1
        except Exception:
            pass

    # CITATIONS
    old_cites = md.at[idx, "citation_count"]
    if ss_cites is not None:
        try:
            ss_cites_int = int(ss_cites)
            if ss_cites_int >= 0 and (pd.isna(old_cites) or int(old_cites) != ss_cites_int):
                md.at[idx, "citation_count"] = ss_cites_int
                updated_cites += 1
        except Exception:
            pass

print("\nSEMANTIC SCHOLAR SUMMARY")
print(f"API calls                : {api_calls}")
print(f"Titles updated           : {updated_title}")
print(f"Publication updated      : {updated_pub}")
print(f"Year updated             : {updated_year}")
print(f"Citation count updated   : {updated_cites}")

# Enforce numeric dtypes
md["year"] = pd.to_numeric(md["year"], errors="coerce").astype("Int64")
md["citation_count"] = pd.to_numeric(md["citation_count"], errors="coerce").astype("Int64")


md.to_csv(OUTPUT_CSV, index=False)
print("\nSaved enriched metadata to:", OUTPUT_CSV)

# BEFORE/AFTER DEMO ON A FEW TOUCHED ROWS

cols_show = ["doi", "canonical_doi", "pdf_title", "year", "publication", "citation_count"]
sample_idx = list(rows_to_fix.index[:10])

print("\nBEFORE (from input file)")
md_before = pd.read_csv(INPUT_CSV).loc[sample_idx, cols_show]
print(md_before.to_string(index=False))

print("\nAFTER (Semantic Scholar)")
print(md.loc[sample_idx, cols_show].to_string(index=False))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading metadata...
Shape: (3831, 13)

ROW STATS
---------
Total rows:               3831
Rows with canonical DOI:  3819
Rows with bad publication:348
Rows with bad title:      17
Rows missing citations:   2
Rows to hit via SemScholar: 357

Enriching from Semantic Scholar (targeted)...


SemScholar: 100%|██████████| 357/357 [01:30<00:00,  3.93row/s]


SEMANTIC SCHOLAR SUMMARY
API calls                : 357
Titles updated           : 8
Publication updated      : 0
Year updated             : 0
Citation count updated   : 0

Saved enriched metadata to: /content/drive/MyDrive/CapstoneProject/Capstone/metadata_enriched_scholar.csv

BEFORE (from input file)
                           doi                  canonical_doi                                                                                                                                                       pdf_title   year                                                                                                                               publication  citation_count
10_1002_14651858_cd010985_pub4 10.1002/14651858.CD010985.pub4                                                                          Interventions for treating intrahepatic cholestasis in people with sickle cell disease 2020.0                                                         Copyright © 2020 The Cochr




In [21]:
# GLOBAL SEMANTIC SCHOLAR ENRICHMENT + PUBLICATION CLEANUP + CATEGORY FIX

from google.colab import drive
drive.mount("/content/drive", force_remount=False)

import pandas as pd
import requests
import re
import time
from pathlib import Path
from tqdm import tqdm

# CONFIG
BASE_DIR   = Path("/content/drive/MyDrive/CapstoneProject/Capstone")
INPUT_CSV  = BASE_DIR / "metadata_enriched_scholar.csv"   # latest file you created
OUTPUT_CSV = BASE_DIR / "metadata_enriched_scholar_full.csv"

print("Loading metadata")
md = pd.read_csv(INPUT_CSV)
print("Shape:", md.shape)

# Ensure expected columns exist
expected_cols = ["doi", "canonical_doi", "pdf_title", "publication", "year", "citation_count", "category"]
for col in expected_cols:
    if col not in md.columns:
        raise ValueError(f"Expected column '{col}' not found in metadata!")



# HELPERS: VALIDATION + CLEANUP

def is_bad_publication(pub: str) -> bool:
    """
    Decide whether 'publication' looks like junk (affiliations, portal clutter, etc.)
    """
    if not isinstance(pub, str):
        return True

    s = pub.strip()
    if not s:
        return True

    lower = s.lower()

    bad_substrings = [
        "division of",
        "department of",
        "school of",
        "faculty of",
        "university",
        "hospital",
        "research portal",
        "research explorer",
        "king's research",
        "edinburgh research",
        "portal are retained by the authors",
        "copyright and moral rights",
        "copyright ©",
        "all rights reserved",
        "preparing to download",
        "viewing from:",
        "request unsuccessful",
        "incapsula incident",
        "zora (zurich open repository",
        "this article is protected by copyright",
        "this is the published version"
    ]

    if any(bad in lower for bad in bad_substrings):
        return True

    # no letters at all → junk
    if not re.search(r"[a-zA-Z]", s):
        return True

    # extremely long is probably a sentence, not a journal name
    if len(s) > 200:
        return True

    return False


def is_bad_title(title: str) -> bool:
    if not isinstance(title, str):
        return True

    s = title.strip()
    if not s:
        return True

    lower = s.lower()

    bad_exact = {
        "unknown",
        "preparing to download ...",
        "publisher's pdf, also known as version of record",
    }
    if lower in bad_exact:
        return True

    if "division of" in lower or "department of" in lower or "university" in lower:
        return True

    if len(s) < 15 or len(s) > 400:
        return True

    return False


def strip_html(text: str):
    if not isinstance(text, str):
        return text
    text = re.sub(r"<[^>]+>", "", text)           # remove HTML tags
    text = re.sub(r"\s+", " ", text)             # collapse whitespace
    return text.strip()


def clean_publication_name(pub: str):
    """
    Clean up the publication string; if it's garbage, return None.
    """
    if not isinstance(pub, str):
        return None

    s = strip_html(pub)

    # Remove leading/trailing punctuation
    s = re.sub(r"^[\s\.,;:\-\|]+", "", s)
    s = re.sub(r"[\s\.,;:\-\|]+$", "", s)

    # If now it looks bad, mark as None
    if is_bad_publication(s):
        return None

    return s

# SEMANTIC SCHOLAR API (FOR ALL CANONICAL DOIs)

SS_BASE   = "https://api.semanticscholar.org/graph/v1/paper/DOI:"
SS_FIELDS = "title,year,venue,citationCount"
SS_HEADERS = {
    "User-Agent": "taruni-metadata-bot/1.0 (semantic-scholar enrichment)",
}

def fetch_semantic_scholar(doi: str):
    url = f"{SS_BASE}{doi}"
    params = {"fields": SS_FIELDS}
    try:
        r = requests.get(url, headers=SS_HEADERS, params=params, timeout=10)
        if r.status_code != 200:
            return None
        return r.json()
    except Exception:
        return None

def parse_ss_meta(obj):
    if not obj:
        return None, None, None, None
    title = obj.get("title") or None
    year  = obj.get("year") or None
    venue = obj.get("venue") or None
    cites = obj.get("citationCount") or None
    return title, venue, year, cites


# BUILD MASK OF ROWS WE CAN HIT (ALL WITH CANONICAL DOI)

md["canonical_doi"] = md["canonical_doi"].astype(str)
has_canon = md["canonical_doi"].notna() & md["canonical_doi"].str.strip().ne("nan")

rows_to_hit = md[has_canon].copy()
print("\nSEMANTIC SCHOLAR GLOBAL ENRICHMENT")
print(f"Total rows:              {len(md)}")
print(f"Rows with canonical DOI: {has_canon.sum()} (will call S2 on all of these)")

if "metadata_source" not in md.columns:
    md["metadata_source"] = "original"

updated_title = 0
updated_pub   = 0
updated_year  = 0
updated_cites = 0
api_calls     = 0

print("\nCalling Semantic Scholar for all canonical DOIs...")
for idx in tqdm(rows_to_hit.index, desc="SemanticScholar (full)", unit="paper"):
    canon = md.at[idx, "canonical_doi"]
    if not isinstance(canon, str) or not canon.strip() or canon.strip().lower() == "nan":
        continue

    canon = canon.strip()
    ss_obj = fetch_semantic_scholar(canon)
    api_calls += 1
    time.sleep(0.15)  # throttle

    ss_title, ss_venue, ss_year, ss_cites = parse_ss_meta(ss_obj)

    # TITLE
    old_title = md.at[idx, "pdf_title"]
    new_title = old_title

    if ss_title:
        ss_title_clean = strip_html(ss_title)
        if is_bad_title(old_title) and not is_bad_title(ss_title_clean):
            new_title = ss_title_clean
        else:
            # if old title looks kinda meh, we can still prefer S2
            if not is_bad_title(ss_title_clean) and len(ss_title_clean) >= len(str(old_title) or "") / 2:
                new_title = ss_title_clean

    if isinstance(new_title, str) and new_title.strip() and new_title != old_title:
        md.at[idx, "pdf_title"] = new_title.strip()
        updated_title += 1
        src = str(md.at[idx, "metadata_source"])
        if "semanticscholar" not in src.lower():
            md.at[idx, "metadata_source"] = src + "+semanticscholar_full"

    # PUBLICATION
    old_pub = md.at[idx, "publication"]
    cleaned_old = clean_publication_name(old_pub) if isinstance(old_pub, str) else None

    final_pub = cleaned_old
    if isinstance(ss_venue, str) and ss_venue.strip():
        ss_venue_clean = clean_publication_name(ss_venue)
        # prefer Semantic Scholar venue if our current pub is junk or missing
        if ss_venue_clean and (final_pub is None or is_bad_publication(old_pub)):
            final_pub = ss_venue_clean

    if final_pub is None:
        # keep as NaN/None
        if not pd.isna(old_pub):
            md.at[idx, "publication"] = None
            updated_pub += 1
    else:
        if old_pub != final_pub:
            md.at[idx, "publication"] = final_pub
            updated_pub += 1

    # YEAR
    old_year = md.at[idx, "year"]
    if ss_year:
        try:
            ss_year_int = int(ss_year)
            if 1900 <= ss_year_int <= 2035:
                if pd.isna(old_year) or int(old_year) != ss_year_int:
                    md.at[idx, "year"] = ss_year_int
                    updated_year += 1
        except Exception:
            pass

    # CITATIONS
    old_cites = md.at[idx, "citation_count"]
    if ss_cites is not None:
        try:
            ss_cites_int = int(ss_cites)
            if ss_cites_int >= 0 and (pd.isna(old_cites) or int(old_cites) != ss_cites_int):
                md.at[idx, "citation_count"] = ss_cites_int
                updated_cites += 1
        except Exception:
            pass

print("\nSEMANTIC SCHOLAR (FULL) SUMMARY")

print(f"API calls              : {api_calls}")
print(f"Titles updated         : {updated_title}")
print(f"Publication updated    : {updated_pub}")
print(f"Year updated           : {updated_year}")
print(f"Citation count updated : {updated_cites}")


# CATEGORY FIX FOR UNASSIGNED (USING TITLE + PUBLICATION KEYWORDS)

print("\nCATEGORY FIX FOR UNASSIGNED")

if "category" not in md.columns:
    md["category"] = "unassigned"

unassigned_mask = (md["category"].isna()) | (md["category"].str.lower() == "unassigned")
print(f"Unassigned rows before classification: {unassigned_mask.sum()}")

# keyword sets
HEART_KWS = [
    "heart", "cardiac", "cardio", "myocard", "coronary", "ventricular", "atrial",
    "cardiomyopathy", "cardiovascular", "heart failure", "ischemic", "myocarditis"
]
LUNG_KWS = [
    "lung", "pulmon", "respiratory", "bronch", "copd", "asthma", "pneumon", "airway",
    "ventilator", "ventilation", "emphysema", "cystic fibrosis", "interstitial lung"
]
LIVER_KWS = [
    "liver", "hepatic", "hepat", "cirrhosis", "cholest", "biliary", "portal vein",
    "steatohepatitis", "nash", "transaminase", "bilirubin", "ascites"
]
KIDNEY_KWS = [
    "kidney", "renal", "nephro", "glomer", "dialysis", "uremia", "hemodialysis",
    "proteinuria", "hematuria", "ckd", "chronic kidney"
]

def classify_category_from_text(text: str):
    if not isinstance(text, str):
        return None
    s = text.lower()

    scores = {"heart": 0, "lung": 0, "liver": 0, "kidney": 0}

    for kw in HEART_KWS:
        if kw in s:
            scores["heart"] += 1
    for kw in LUNG_KWS:
        if kw in s:
            scores["lung"] += 1
    for kw in LIVER_KWS:
        if kw in s:
            scores["liver"] += 1
    for kw in KIDNEY_KWS:
        if kw in s:
            scores["kidney"] += 1

    best_cat = max(scores, key=scores.get)
    best_score = scores[best_cat]

    # require at least 1 keyword and a unique winner
    if best_score == 0:
        return None
    # check ties
    if list(scores.values()).count(best_score) > 1:
        return None

    return best_cat

new_cats = 0

for idx in md[unassigned_mask].index:
    title = md.at[idx, "pdf_title"]
    pub   = md.at[idx, "publication"]
    text_for_cat = " ".join([str(title or ""), str(pub or "")])
    cat = classify_category_from_text(text_for_cat)
    if cat is not None:
        md.at[idx, "category"] = cat
        new_cats += 1

unassigned_after = (md["category"].isna()) | (md["category"].str.lower() == "unassigned")
print(f"Newly classified unassigned rows: {new_cats}")
print(f"Unassigned rows after classification: {unassigned_after.sum()}")

# FINAL TYPE CLEANUP + SAVE

md["year"] = pd.to_numeric(md["year"], errors="coerce").astype("Int64")
md["citation_count"] = pd.to_numeric(md["citation_count"], errors="coerce").astype("Int64")

md.to_csv(OUTPUT_CSV, index=False)
print(f"\nSaved fully enriched + cleaned metadata to:\n{OUTPUT_CSV}")

print("\nQuick check:")
print(md[["doi", "canonical_doi", "pdf_title", "year", "publication", "citation_count", "category"]].head(10).to_string(index=False))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading metadata
Shape: (3831, 13)

SEMANTIC SCHOLAR GLOBAL ENRICHMENT
Total rows:              3831
Rows with canonical DOI: 3819 (will call S2 on all of these)

Calling Semantic Scholar for all canonical DOIs...


SemanticScholar (full): 100%|██████████| 3819/3819 [16:03<00:00,  3.96paper/s]


SEMANTIC SCHOLAR (FULL) SUMMARY
API calls              : 3819
Titles updated         : 58
Publication updated    : 1024
Year updated           : 31
Citation count updated : 225

CATEGORY FIX FOR UNASSIGNED
Unassigned rows before classification: 85
Newly classified unassigned rows: 29
Unassigned rows after classification: 56

Saved fully enriched + cleaned metadata to:
/content/drive/MyDrive/CapstoneProject/Capstone/metadata_enriched_scholar_full.csv

Quick check:
                           doi                  canonical_doi                                                                                                                                                                                                                                                               pdf_title  year                             publication  citation_count   category
10_1002_14651858_cd005055_pub3 10.1002/14651858.CD005055.pub3                                                                       


