In [None]:
# Mount Google Drive at /content/drive; the paths configured in this notebook
# all live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install Dependencies
# pypdfium2 and PyMuPDF are the two PDF text extractors used in this notebook;
# pandas, tqdm and requests back the metadata table, progress bars and API calls.

!pip install pypdfium2 PyMuPDF pandas tqdm requests

In [None]:
# CONFIGURATION
# Every location used by the pipeline sits under a single Drive project folder,
# so the individual paths are derived from that one root.

_PROJECT_ROOT = "/content/drive/MyDrive/CapstoneProject/Capstone"

PDF_FOLDER = _PROJECT_ROOT + "/papers"
TEXT_OUTPUT = _PROJECT_ROOT + "/text_extraction"
METADATA_CSV = _PROJECT_ROOT + "/metadata_new.csv"

# Source files for category assignment (one download log per organ)
_DOWNLOAD_LOG_TEMPLATE = _PROJECT_ROOT + "/download_log_{}_transplant.csv"
LIVER_SOURCE = _DOWNLOAD_LOG_TEMPLATE.format("liver")
LUNG_SOURCE = _DOWNLOAD_LOG_TEMPLATE.format("lung")
HEART_SOURCE = _DOWNLOAD_LOG_TEMPLATE.format("heart")
KIDNEY_SOURCE = _DOWNLOAD_LOG_TEMPLATE.format("kidney")

# Contact address sent in the User-Agent header of the API requests
YOUR_EMAIL = "tn2463@nyu.edu"

In [None]:
from pathlib import Path

# Create the text output directory, including missing parents; a directory that
# already exists is accepted as-is.
text_output_dir = Path(TEXT_OUTPUT)
text_output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output folder ready: {TEXT_OUTPUT}")

In [None]:
import pypdfium2 as pdfium
from pathlib import Path
from tqdm import tqdm
import time
import pandas as pd

# Gather every PDF below PDF_FOLDER, descending into subdirectories.
all_pdfs = list(Path(PDF_FOLDER).rglob('*.pdf'))

print("TEXT EXTRACTION FROM PDFs")
print(f"Total PDFs: {len(all_pdfs)}\n")

def extract_doi_from_filename(filename):
    """Recover a DOI-style identifier from a PDF file name.

    The trailing ``.pdf`` extension is removed and every underscore is turned
    into a slash.

    Args:
        filename: Base name of the PDF, e.g. ``"10.1000_abc.pdf"``.

    Returns:
        The name without its extension, with ``/`` in place of each ``_``.
    """
    # Strip only the trailing extension. A blanket replace('.pdf', '') would
    # also delete ".pdf" from the middle of a name, and the file could then no
    # longer be located again from the identifier.
    suffix = '.pdf'
    name = filename[:-len(suffix)] if filename.endswith(suffix) else filename
    # NOTE(review): an underscore that is genuinely part of a DOI is converted
    # too; the mapping is only reversible through doi.replace('/', '_').
    return name.replace('_', '/')

def extract_title_from_text(text):
    """Guess a title from the top of the extracted text.

    Looks at the first ten lines of the stripped text and returns the first
    one whose stripped length is more than 20 and fewer than 200 characters.

    Args:
        text: Extracted document text.

    Returns:
        The chosen line, or ``"Unknown"`` when no line qualifies.
    """
    leading_lines = (raw.strip() for raw in text.strip().split('\n')[:10])
    return next(
        (candidate for candidate in leading_lines if 20 < len(candidate) < 200),
        "Unknown",
    )

def _apply_extracted_text(record, text):
    """Fill the length, title and extraction-status fields of ``record`` from ``text``."""
    record['text_length'] = len(text.strip())
    record['pdf_title'] = extract_title_from_text(text)
    if record['text_length'] < 500:
        # Fewer than 500 characters of text: flag the PDF as scanned / OCR needed.
        record['is_scanned'] = True
        record['needs_ocr'] = True
        record['extraction_method'] = 'ocr'
    else:
        record['extraction_method'] = 'text_extraction'


metadata_records = []
start_time = time.time()

for pdf_path in tqdm(all_pdfs, desc="Extracting", unit="pdf"):
    doi = extract_doi_from_filename(pdf_path.name)
    txt_file = Path(TEXT_OUTPUT) / f"{doi.replace('/', '_')}.txt"

    # Default row; the fields below are overwritten as extraction succeeds.
    record = {
        'pdf_title': 'Unknown',
        'doi': doi,
        'file_size_mb': round(pdf_path.stat().st_size / (1024 * 1024), 2),
        'text_length': 0,
        'is_scanned': False,
        'needs_ocr': False,
        'extraction_method': 'none',
        'year': None,
        'citation_count': None,
        'publication': None,
        'category': 'other'
    }

    if txt_file.exists():
        # A text file from an earlier run acts as a cache: reuse it.
        with open(txt_file, 'r', encoding='utf-8') as f:
            text = f.read()
        _apply_extracted_text(record, text)
    else:
        try:
            pdf = pdfium.PdfDocument(pdf_path)
            try:
                text = ""
                for i in range(len(pdf)):
                    page_text = pdf[i].get_textpage().get_text_range()
                    if page_text:
                        text += page_text + "\n\n"
            finally:
                # Release the document even when a page fails to extract.
                pdf.close()

            with open(txt_file, "w", encoding="utf-8") as f:
                f.write(text)

            _apply_extracted_text(record, text)

        except Exception as e:
            # Best effort: mark the row as failed so the PyMuPDF / OCR passes
            # can retry it, but say why instead of failing silently.
            record['extraction_method'] = 'failed'
            record['needs_ocr'] = True
            record['is_scanned'] = True
            tqdm.write(f"  Extraction failed for {pdf_path.name}: {e}")

    metadata_records.append(record)

metadata_df = pd.DataFrame(metadata_records)
metadata_df.to_csv(METADATA_CSV, index=False)

elapsed_time = time.time() - start_time
needs_ocr_count = len(metadata_df[metadata_df['needs_ocr'] == True])

print("EXTRACTION COMPLETE")
print(f"Total PDFs: {len(metadata_df)}")
print(f"Need OCR: {needs_ocr_count}")
print(f"Time: {elapsed_time/60:.1f} minutes")
print(f"\nExtraction method distribution:")
print(metadata_df['extraction_method'].value_counts())

In [None]:
# RE-EXTRACT FAILED PDFS WITH PYMUPDF
# Second pass over rows whose first extraction was recorded as 'failed'.

import fitz  # PyMuPDF

df = pd.read_csv(METADATA_CSV)
missing_df = df[df['extraction_method'] == 'failed'].copy()

# The first pass discovers PDFs recursively ('**/*.pdf'), so a failed file may
# live in a subfolder. Index every PDF by file name to find it again.
pdf_paths_by_name = {p.name: p for p in Path(PDF_FOLDER).glob('**/*.pdf')}

print("RE-EXTRACTING FAILED PDFs WITH PyMuPDF")
print(f"Failed PDFs to re-extract: {len(missing_df)}\n")

success = 0
failed = 0

for idx, row in tqdm(missing_df.iterrows(), total=len(missing_df), desc="Extracting"):
    doi = row['doi']
    pdf_filename = doi.replace('/', '_') + '.pdf'
    txt_file = Path(TEXT_OUTPUT) / f"{doi.replace('/', '_')}.txt"

    # Prefer the top-level location (previous behaviour), then fall back to the
    # recursive index.
    pdf_path = Path(PDF_FOLDER) / pdf_filename
    if not pdf_path.exists():
        pdf_path = pdf_paths_by_name.get(pdf_filename)

    if pdf_path is None:
        failed += 1
        tqdm.write(f"  PDF not found: {pdf_filename}")
        continue

    try:
        doc = fitz.open(str(pdf_path))
        try:
            text = ""
            for page_num in range(len(doc)):
                try:
                    page_text = doc[page_num].get_text()
                except Exception:
                    # Skip a page that cannot be read; keep the other pages.
                    continue
                if page_text:
                    text += page_text + "\n\n"
        finally:
            # Close the document even if iteration raised.
            doc.close()

        with open(txt_file, "w", encoding="utf-8") as f:
            f.write(text)

        text_length = len(text.strip())
        df.at[idx, 'text_length'] = text_length
        df.at[idx, 'pdf_title'] = extract_title_from_text(text)

        if text_length < 500:
            # Still too little text: leave the row flagged for OCR.
            df.at[idx, 'extraction_method'] = 'ocr'
            df.at[idx, 'needs_ocr'] = True
            df.at[idx, 'is_scanned'] = True
        else:
            df.at[idx, 'extraction_method'] = 'text_extraction'
            df.at[idx, 'needs_ocr'] = False
            df.at[idx, 'is_scanned'] = False

        success += 1

    except Exception as e:
        failed += 1
        tqdm.write(f"  Re-extraction failed for {pdf_filename}: {e}")

df.to_csv(METADATA_CSV, index=False)

print("RE-EXTRACTION COMPLETE")
print(f"Successfully extracted: {success}")
print(f"Still failed: {failed}")
print(f"\nExtraction method distribution:")
print(df['extraction_method'].value_counts())

In [None]:
# EXTRACT METADATA FROM TEXT FILES
# Setup for this cell: regex/Counter imports used by the extractor functions
# below, and a fresh copy of the metadata table read from METADATA_CSV.

import re
from collections import Counter

print("EXTRACTING METADATA FROM TEXT FILES")

df = pd.read_csv(METADATA_CSV)
print(f"Loaded {len(df)} records\n")

def extract_year_from_text(text, doi, max_year=None):
    """Guess the publication year from the first 10,000 characters of ``text``.

    Several regex patterns vote for candidate years; each pattern carries a
    weight and the year with the highest total wins. On a tie the year that
    was seen first wins.

    Args:
        text: Extracted document text.
        doi: Unused; kept so existing callers keep working.
        max_year: Latest year accepted as plausible. Defaults to next calendar
            year, replacing the previous fixed ceiling of 2025, which rejected
            every newer paper.

    Returns:
        The winning year as ``int``, or ``None`` when the text is missing,
        shorter than 50 characters, or yields no plausible year.
    """
    if not text or len(text) < 50:
        return None

    if max_year is None:
        from datetime import date
        # Allow one year ahead: issue/copyright years can run ahead of today.
        max_year = date.today().year + 1

    header = text[:10000]

    # (pattern, weight): explicit publication/copyright statements count more
    # than received/accepted dates or "YYYY; volume" style citations.
    patterns = [
        (r'(?:published|publication\s+year)[:\s]+(\d{4})', 3),
        (r'(?:copyright|©|\(c\))[:\s]*(\d{4})', 3),
        (r'(?:received|accepted)[:\s]+\w+[,\s]+(\d{4})', 2),
        (r'\b(\d{4})\s*;\s*\d+', 2),
    ]

    votes = Counter()
    for pattern, weight in patterns:
        for match in re.findall(pattern, header, re.IGNORECASE):
            year = int(match)  # the pattern captures exactly four digits
            if 1950 <= year <= max_year:
                votes[year] += weight

    if votes:
        return votes.most_common(1)[0][0]
    return None

def extract_publication_from_text(text):
    """Find a line near the top of ``text`` that looks like a journal name.

    Only the first 5,000 characters, and at most the first 80 lines of those,
    are examined. A line qualifies when its stripped length is between 16 and
    199 characters and it matches one of the journal patterns
    (case-insensitive).

    Args:
        text: Extracted document text.

    Returns:
        The first qualifying line with runs of whitespace collapsed to single
        spaces, or ``None`` if the text is missing, shorter than 50 characters,
        or no line qualifies.
    """
    if not text or len(text) < 50:
        return None

    journal_hints = (
        r'nature\s+(?:medicine|communications?|reviews?)',
        r'(?:jama|lancet|bmj|plos|nejm)',
        r'journal\s+of\s+[\w\s&-]{5,50}',
        r'(?:annals?|archives?)\s+of\s+[\w\s&-]{5,50}',
    )

    for raw_line in text[:5000].split('\n')[:80]:
        candidate = raw_line.strip()
        if not (15 < len(candidate) < 200):
            continue
        if any(re.search(hint, candidate, re.IGNORECASE) for hint in journal_hints):
            return re.sub(r'\s+', ' ', candidate)
    return None

def extract_citation_from_text(text):
    """Look for an explicit citation count in the first 15,000 characters.

    Patterns are tried in order ("cited by N", "N citations", "times cited N").
    The first number found that lies in ``[1, 100000)`` is returned.

    Args:
        text: Extracted document text.

    Returns:
        The citation count as ``int``, or ``None`` if the text is missing,
        shorter than 50 characters, or contains no acceptable number.
    """
    if not text or len(text) < 50:
        return None

    patterns = [
        r'cited\s+by[:\s]+(\d+)',
        r'(\d+)\s+citations?',
        r'times\s+cited[:\s]+(\d+)',
    ]

    head = text[:15000]  # slice once instead of once per pattern

    for pattern in patterns:
        for match in re.findall(pattern, head, re.IGNORECASE):
            try:
                count = int(match)
            except ValueError:
                # Only conversion errors are skipped; a bare except here would
                # also swallow KeyboardInterrupt.
                continue
            if 1 <= count < 100000:
                return count
    return None

# Fill year / publication / citation_count from the extracted text, but only
# for rows where the value is still missing.
updated = {'year': 0, 'citation': 0, 'publication': 0}

# An all-empty 'publication' column is read back from CSV as float64; writing a
# journal name into it is an incompatible-dtype assignment. Hold it as object.
df['publication'] = df['publication'].astype(object)

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting metadata"):
    doi = row['doi']
    txt_file = Path(TEXT_OUTPUT) / f"{doi.replace('/', '_')}.txt"

    if not txt_file.exists():
        continue

    try:
        with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
    except OSError as e:
        # Unreadable file: skip the row, but report it.
        tqdm.write(f"  Could not read {txt_file.name}: {e}")
        continue

    if len(text) < 50:
        continue

    if pd.isna(row['year']):
        year = extract_year_from_text(text, doi)
        if year:
            df.at[idx, 'year'] = float(year)
            updated['year'] += 1

    if pd.isna(row['publication']):
        pub = extract_publication_from_text(text)
        if pub:
            df.at[idx, 'publication'] = pub
            updated['publication'] += 1

    if pd.isna(row['citation_count']):
        cites = extract_citation_from_text(text)
        if cites:
            df.at[idx, 'citation_count'] = float(cites)
            updated['citation'] += 1

df.to_csv(METADATA_CSV, index=False)

print("METADATA EXTRACTION COMPLETE")
print(f"Updated years: {updated['year']}")
print(f"Updated publications: {updated['publication']}")
print(f"Updated citations: {updated['citation']}")

In [None]:
# FETCH METADATA FROM APIS

import requests
import time

def get_crossref_metadata(doi):
    """Look up year, citation count and journal for ``doi`` on Crossref.

    Args:
        doi: DOI string; it is percent-encoded before being put in the URL.

    Returns:
        ``(year, citations, journal)``. Any element may be ``None`` when the
        record lacks it; all three are ``None`` when the request fails or the
        response status is not 200.
    """
    from urllib.parse import quote

    try:
        # Encode the DOI: characters such as '#' or '?' would otherwise be
        # read as URL delimiters and cut the identifier short.
        url = f"https://api.crossref.org/works/{quote(str(doi), safe='/')}"
        headers = {'User-Agent': f'mailto:{YOUR_EMAIL}'}
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return None, None, None

        message = response.json().get('message', {})

        year = None
        # NOTE(review): 'issued' is added as a last-resort date field; confirm
        # against the Crossref works schema.
        published = (
            message.get('published-print')
            or message.get('published-online')
            or message.get('issued')
        )
        if published:
            # An empty or missing 'date-parts' list must not discard the rest
            # of the record.
            date_parts = (published.get('date-parts') or [[]])[0]
            if date_parts:
                year = date_parts[0]

        # None (not 0) when the field is absent, so "unknown" and "zero
        # citations" stay distinguishable.
        citations = message.get('is-referenced-by-count')

        container = message.get('container-title', [])
        journal = container[0] if container else None

        return year, citations, journal
    except Exception:
        # Best-effort lookup: network or payload problems mean "no data".
        return None, None, None

def get_openalex_metadata(doi):
    """Look up year, citation count and journal for ``doi`` on OpenAlex.

    Args:
        doi: DOI string; it is percent-encoded before being put in the URL.

    Returns:
        ``(year, citations, journal)``. Any element may be ``None`` when the
        record lacks it; all three are ``None`` when the request fails or the
        response status is not 200.
    """
    from urllib.parse import quote

    try:
        # Encode the DOI so characters like '#' or '?' are not treated as URL
        # delimiters.
        url = f"https://api.openalex.org/works/doi:{quote(str(doi), safe='/')}"
        headers = {'User-Agent': f'mailto:{YOUR_EMAIL}'}
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return None, None, None

        data = response.json()

        year = data.get('publication_year')
        # None (not 0) when the field is absent, so "unknown" and "zero
        # citations" stay distinguishable.
        citations = data.get('cited_by_count')

        journal = None
        primary_location = data.get('primary_location', {})
        if primary_location:
            source = primary_location.get('source')
            if source:
                journal = source.get('display_name')

        return year, citations, journal
    except Exception:
        # Best-effort lookup: network or payload problems mean "no data".
        return None, None, None

print("="*60)
print("FETCHING METADATA FROM APIs")
print("="*60)

df = pd.read_csv(METADATA_CSV)
# An all-empty 'publication' column is read back as float64; journal names are
# strings, so hold the column as object before assigning into it.
df['publication'] = df['publication'].astype(object)
api_updated = {'year': 0, 'citation': 0, 'publication': 0}

for idx, row in tqdm(df.iterrows(), total=len(df), desc="API lookup"):
    doi = row['doi']

    # Nothing to do when every field is already filled in.
    if pd.notna(row['year']) and pd.notna(row['citation_count']) and pd.notna(row['publication']):
        continue

    # Crossref first.
    year_cr, cite_cr, journal_cr = get_crossref_metadata(doi)

    if pd.isna(row['year']) and year_cr:
        df.at[idx, 'year'] = float(year_cr)
        api_updated['year'] += 1

    # A count of 0 is a real answer. A truthiness test would discard it and
    # trigger the same lookup again on every run.
    if pd.isna(row['citation_count']) and cite_cr is not None:
        df.at[idx, 'citation_count'] = float(cite_cr)
        api_updated['citation'] += 1

    if pd.isna(row['publication']) and journal_cr:
        df.at[idx, 'publication'] = journal_cr
        api_updated['publication'] += 1

    # OpenAlex only for whatever Crossref left empty.
    if pd.isna(df.at[idx, 'year']) or pd.isna(df.at[idx, 'citation_count']) or pd.isna(df.at[idx, 'publication']):
        year_oa, cite_oa, journal_oa = get_openalex_metadata(doi)

        if pd.isna(df.at[idx, 'year']) and year_oa:
            df.at[idx, 'year'] = float(year_oa)
            api_updated['year'] += 1

        if pd.isna(df.at[idx, 'citation_count']) and cite_oa is not None:
            df.at[idx, 'citation_count'] = float(cite_oa)
            api_updated['citation'] += 1

        if pd.isna(df.at[idx, 'publication']) and journal_oa:
            df.at[idx, 'publication'] = journal_oa
            api_updated['publication'] += 1

    # Checkpoint and pause whenever a processed row index hits a multiple of 100.
    if (idx + 1) % 100 == 0:
        df.to_csv(METADATA_CSV, index=False)
        time.sleep(1)

df.to_csv(METADATA_CSV, index=False)

print("API FETCH COMPLETE")
print(f"Updated years: {api_updated['year']}")
print(f"Updated citations: {api_updated['citation']}")
print(f"Updated publications: {api_updated['publication']}")

In [None]:
# ASSIGN CATEGORIES FROM SOURCE FILES

print("ASSIGNING CATEGORIES FROM SOURCE FILES")

def normalize_doi(doi):
    """Bring a DOI into the form used as the lookup key.

    The value is stringified, stripped, lower-cased, every ``.`` is replaced by
    ``/``, and trailing commas plus surrounding whitespace are removed.

    Args:
        doi: DOI value; may be missing (``None`` / NaN).

    Returns:
        The normalized string, or ``None`` for a missing value.
    """
    if pd.isna(doi):
        return None
    slashed = str(doi).strip().lower().replace('.', '/')
    return slashed.rstrip(',').strip()

def parse_source_file(content):
    """Pull ``(doi, status)`` pairs out of a download log.

    A pair is a run of non-whitespace characters starting with ``10.`` that is
    immediately followed by ``success`` or ``failed`` (case-insensitive).

    Args:
        content: Full text of the log file.

    Returns:
        List of ``(doi, status)`` tuples in order of appearance. The DOI part
        keeps whatever separator precedes the status word.
    """
    doi_status = re.compile(r'(10\.\S+?)(success|failed)', re.IGNORECASE)
    return [found.groups() for found in doi_status.finditer(content)]

df = pd.read_csv(METADATA_CSV)
print(f"Loaded {len(df)} records")

# Organ name -> download log listing the DOIs fetched for that organ.
source_files = {
    'liver': LIVER_SOURCE,
    'lung': LUNG_SOURCE,
    'heart': HEART_SOURCE,
    'kidney': KIDNEY_SOURCE
}

doi_to_organ = {}
organ_stats = {}

for organ, filepath in source_files.items():
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        doi_status_pairs = parse_source_file(content)

        success_count = 0
        for doi, status in doi_status_pairs:
            # Only DOIs whose download succeeded can have a PDF in the dataset.
            if status.lower() == 'success':
                doi_normalized = normalize_doi(doi)
                if doi_normalized:
                    # A DOI listed in several logs keeps the organ seen last.
                    doi_to_organ[doi_normalized] = organ
                    success_count += 1

        organ_stats[organ] = success_count
        print(f"  {organ:10} : {success_count:4d} DOIs loaded")

    except FileNotFoundError:
        print(f"  {organ:10} : File not found")
        organ_stats[organ] = 0

print(f"\nTotal DOIs mapped: {len(doi_to_organ)}")

# Normalize the metadata DOIs with the SAME function used for the lookup keys.
# The keys have every '.' turned into '/', so comparing them with a merely
# lower-cased DOI can never match an identifier that still contains dots.
normalized_dois = df['doi'].map(normalize_doi)
df['category'] = normalized_dois.map(doi_to_organ).astype(object)
matched = int(df['category'].notna().sum())

df['category'] = df['category'].fillna('unassigned')
df.to_csv(METADATA_CSV, index=False)

print(f"\nMatched: {matched}/{len(df)}")
print(f"\nCategory distribution:")
print(df['category'].value_counts())

In [None]:
# CLASSIFY UNASSIGNED PAPERS BY KEYWORDS
# Reload the metadata table and select the rows whose category is still
# 'unassigned'.
# Requires `pd` (and, further down, `re` and `Path`) from earlier cells: run
# the import cells first on a fresh kernel.

print("CLASSIFYING UNASSIGNED PAPERS BY KEYWORDS")

df = pd.read_csv(METADATA_CSV)
unassigned = df[df['category'] == 'unassigned']
print(f"Unassigned papers: {len(unassigned)}")

def keyword_classify(text):
    """Assign an organ category by counting organ-related word prefixes.

    The text is lower-cased and, for each organ, the occurrences of its terms
    at the start of a word are counted. The organ with the highest count wins;
    on a tie the organ listed first (liver, lung, heart, kidney) wins.

    Args:
        text: Text to classify.

    Returns:
        ``'liver'``, ``'lung'``, ``'heart'`` or ``'kidney'``; ``'other'`` when
        no term occurs at all.
    """
    haystack = text.lower()

    organ_terms = {
        'liver': ['liver', 'hepatic', 'hepato', 'cirrhosis', 'bile', 'hepatitis'],
        'lung': ['lung', 'pulmonary', 'respiratory', 'bronch', 'pneumo', 'airway'],
        'heart': ['heart', 'cardiac', 'cardio', 'coronary', 'myocard', 'aortic'],
        'kidney': ['kidney', 'renal', 'nephro', 'dialysis', 'glomerular', 'urinary'],
    }

    scores = {
        organ: sum(len(re.findall(r'\b' + term, haystack)) for term in terms)
        for organ, terms in organ_terms.items()
    }

    best_organ = max(scores, key=scores.get)
    return best_organ if scores[best_organ] > 0 else 'other'

text_dir = Path(TEXT_OUTPUT)

# Classify each still-unassigned paper from the first 5,000 characters of its
# extracted text; papers with no readable text file become 'other'.
for idx, row in unassigned.iterrows():
    doi = row['doi']
    txt_file = text_dir / f"{doi.replace('/', '_')}.txt"

    organ = 'other'
    if txt_file.exists():
        try:
            with open(txt_file, 'r', encoding='utf-8', errors='ignore') as f:
                organ = keyword_classify(f.read()[:5000])
        except OSError as e:
            # Keep the 'other' fallback, but report the unreadable file instead
            # of hiding every kind of error behind a bare except.
            print(f"  Could not read {txt_file.name}: {e}")

    df.at[idx, 'category'] = organ

df.to_csv(METADATA_CSV, index=False)

print(f"\nFinal category distribution:")
print(df['category'].value_counts())

CLASSIFYING UNASSIGNED PAPERS BY KEYWORDS


NameError: name 'pd' is not defined

In [None]:
# CLEAN CITATION COUNT COLUMN
# A citation count whose integer part equals the row's publication year is
# treated as a mis-extracted value and blanked out.

print("CLEANING CITATION COUNT COLUMN")

df = pd.read_csv(METADATA_CSV)

# Only rows where both values are present can be compared.
both_present = df['year'].notna() & df['citation_count'].notna()
comparable = df.loc[both_present, ['year', 'citation_count']]

suspect_rows = [
    row_label
    for row_label, year_value, cite_value in comparable.itertuples(index=True, name=None)
    if int(float(year_value)) == int(float(cite_value))
]

if suspect_rows:
    df.loc[suspect_rows, 'citation_count'] = None
cleaned_count = len(suspect_rows)

print(f"Cleaned {cleaned_count} incorrect citation values")
df.to_csv(METADATA_CSV, index=False)

In [None]:
# FINAL STATISTICS
# Read the finished metadata table and print coverage and distribution figures.

print("FINAL DATASET STATISTICS")

df = pd.read_csv(METADATA_CSV)
total_papers = len(df)

print(f"\nTotal papers: {total_papers}")

print(f"\nMetadata coverage:")
coverage_fields = (
    ('Year', 'year'),
    ('Citation', 'citation_count'),
    ('Publication', 'publication'),
)
for label, column in coverage_fields:
    filled = df[column].notna().sum()
    print(f"  {label}: {filled}/{total_papers} ({filled/total_papers*100:.1f}%)")

print(f"\nCategory distribution:")
category_counts = df['category'].value_counts()
for cat, count in category_counts.items():
    print(f"  {cat:12}: {count:5d} ({count/total_papers*100:.1f}%)")

# The year range is only meaningful when at least one year is known.
if df['year'].notna().any():
    earliest = int(df['year'].min())
    latest = int(df['year'].max())
    print(f"\nYear range: {earliest} - {latest}")

print(f"\nExtraction method distribution:")
print(df['extraction_method'].value_counts())

print("DATASET READY")

In [None]:
# DEPENDENCIES FOR DEEPSEEK
# Python packages for loading the model and rasterising PDFs, plus the
# poppler-utils system package.
# NOTE(review): poppler-utils is presumably what pdf2image needs to render
# pages; confirm.

!pip install -q torch torchvision
!pip install -q transformers accelerate safetensors
!pip install -q pdf2image pillow
!apt-get install -q poppler-utils

In [None]:
# AUTHENTICATE WITH HUGGING FACE

import os
from getpass import getpass

from huggingface_hub import login

# Never keep the access token in the notebook source. Read it from the HF_TOKEN
# environment variable, or prompt for it without echoing when it is not set.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(HF_TOKEN)

In [None]:
# LOAD DEEPSEEK MODEL
# Loads the processor and the model for the same checkpoint id.

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

print("Loading DeepSeek-VL2 model.")
print("This may take 5-10 minutes on first run.")

model_id = "deepseek-ai/deepseek-vl2-tiny"

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Half-precision weights; device placement is left to device_map="auto".
model_load_options = dict(
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

print("Model loaded successfully!")

In [None]:
# DEFINE DEEPSEEK OCR FUNCTIONS

from PIL import Image
import pdf2image

def ocr_image_with_deepseek(image, model, processor):
    """Transcribe the text of a single page image with the given model.

    A one-turn user message holding the image and a transcription instruction
    is passed through ``processor``, moved to ``model.device`` and fed to
    ``model.generate`` (greedy decoding, at most 2048 new tokens).

    Args:
        image: Page image to transcribe.
        model: Object providing ``device`` and ``generate``.
        processor: Callable that builds model inputs; also provides ``decode``
            and ``tokenizer.pad_token_id``.

    Returns:
        The decoded output. If it contains the word "assistant" in any casing,
        only the stripped text after the last lowercase ``"assistant"`` is
        returned.
    """
    instruction = "Extract and transcribe all text from this image. Output only the extracted text, nothing else."
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": instruction},
            ],
        }
    ]

    model_inputs = processor(conversation, return_tensors="pt", padding=True)
    model_inputs = model_inputs.to(model.device)

    generation_options = dict(
        max_new_tokens=2048,
        do_sample=False,
        pad_token_id=processor.tokenizer.pad_token_id,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(**model_inputs, **generation_options)

    decoded = processor.decode(generated[0], skip_special_tokens=True)

    # NOTE(review): the check is case-insensitive but the split is not, so a
    # capitalised "Assistant" marker is never removed. Page text that itself
    # contains "assistant" is truncated at that word.
    if "assistant" not in decoded.lower():
        return decoded
    return decoded.split("assistant")[-1].strip()

def ocr_pdf_with_deepseek(pdf_path, model, processor, max_pages=20):
    """Rasterise a PDF and OCR its pages with ``ocr_image_with_deepseek``.

    Args:
        pdf_path: Location of the PDF.
        model: Model handed through to ``ocr_image_with_deepseek``.
        processor: Processor handed through to ``ocr_image_with_deepseek``.
        max_pages: Maximum number of leading pages to process; a falsy value
            processes every page.

    Returns:
        The page transcriptions joined by blank lines, each prefixed with a
        ``--- Page N ---`` header (an empty string if no page produced text),
        or ``None`` when the PDF cannot be converted to images.
    """
    render_options = {'dpi': 150}
    if max_pages and max_pages > 0:
        # Render only the pages that will be used, instead of rasterising the
        # whole document and discarding the rest.
        render_options['last_page'] = max_pages

    try:
        images = pdf2image.convert_from_path(str(pdf_path), **render_options)
    except Exception as e:
        print(f"Error converting PDF: {e}")
        return None

    if max_pages:
        images = images[:max_pages]

    all_text = []

    for i, image in enumerate(images):
        print(f"  Processing page {i+1}/{len(images)}...", end="\r")
        try:
            page_text = ocr_image_with_deepseek(image, model, processor)
            if page_text:
                all_text.append(f"--- Page {i+1} ---\n{page_text}")
        except Exception as e:
            # One bad page must not abort the whole document.
            print(f"  Error on page {i+1}: {e}")
            continue

    # Trailing spaces overwrite the remains of the "\r" progress line.
    print(f"  Processed {len(images)} pages.          ")

    return "\n\n".join(all_text)

In [None]:
# PROCESS SCANNED PDFS WITH DEEPSEEK OCR

import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Make sure these match your configuration
PDF_FOLDER = "/content/drive/MyDrive/CapstoneProject/Capstone/papers"
TEXT_OUTPUT = "/content/drive/MyDrive/CapstoneProject/Capstone/text_extraction"
METADATA_CSV = "/content/drive/MyDrive/CapstoneProject/Capstone/metadata_new.csv"

df = pd.read_csv(METADATA_CSV)

# Find PDFs that need OCR (text_length < 500)
needs_ocr = df[df['text_length'] < 500].copy()

# The text-extraction pass collects PDFs recursively, so a file may sit in a
# subfolder. Index every PDF by file name so it can be found again.
pdf_paths_by_name = {p.name: p for p in Path(PDF_FOLDER).glob('**/*.pdf')}

print(f"{'='*60}")
print(f"PROCESSING {len(needs_ocr)} SCANNED PDFs WITH DEEPSEEK OCR")
print(f"{'='*60}\n")

ocr_success = 0
ocr_failed = 0

for idx, row in needs_ocr.iterrows():
    doi = row['doi']
    pdf_filename = doi.replace('/', '_') + '.pdf'
    txt_file = Path(TEXT_OUTPUT) / f"{doi.replace('/', '_')}.txt"

    # Prefer the top-level location (previous behaviour), then fall back to the
    # recursive index.
    pdf_path = Path(PDF_FOLDER) / pdf_filename
    if not pdf_path.exists():
        pdf_path = pdf_paths_by_name.get(pdf_filename)

    if pdf_path is None:
        print(f"PDF not found: {doi}")
        ocr_failed += 1
        continue

    print(f"\nProcessing: {doi}")

    try:
        ocr_text = ocr_pdf_with_deepseek(pdf_path, model, processor, max_pages=20)

        # Accept the result only when it holds more than 100 characters of text.
        if ocr_text and len(ocr_text.strip()) > 100:
            with open(txt_file, 'w', encoding='utf-8') as f:
                f.write(ocr_text)

            df.at[idx, 'text_length'] = len(ocr_text.strip())
            df.at[idx, 'extraction_method'] = 'deepseek_ocr'
            df.at[idx, 'needs_ocr'] = False

            ocr_success += 1
            print(f"  ✓ Success: {len(ocr_text)} characters extracted")
        else:
            ocr_failed += 1
            print(f"  ✗ Failed: No text extracted")

    except Exception as e:
        ocr_failed += 1
        print(f"  ✗ Error: {str(e)[:50]}")

    # Save progress every 5 files
    if (ocr_success + ocr_failed) % 5 == 0:
        df.to_csv(METADATA_CSV, index=False)
        print(f"\n  Progress saved. Success: {ocr_success}, Failed: {ocr_failed}")

# Final save
df.to_csv(METADATA_CSV, index=False)

print(f"\n{'='*60}")
print("DEEPSEEK OCR COMPLETE")
print(f"{'='*60}")
print(f"Successfully OCR'd: {ocr_success}")
print(f"Failed: {ocr_failed}")
print(f"\nExtraction method distribution:")
print(df['extraction_method'].value_counts())