In [None]:
# ============================================
# STEP 1: TEXT EXTRACTION + METADATA CSV
# Saves each PDF's text as {doi}.txt (with '/' in the DOI written as '_')
# CSV columns: pdf_title, doi, file_size_mb, text_length,
#              is_scanned, needs_ocr, extraction_method
# ============================================

import pypdfium2 as pdfium
from pathlib import Path
from tqdm import tqdm
import time
import pandas as pd
import re

# Locations of the source PDFs, the extracted-text folder and the metadata CSV.
PDF_FOLDER = "/content/drive/MyDrive/Capstone/papers"
TEXT_OUTPUT = "/content/drive/MyDrive/Capstone/extracted_text"
METADATA_CSV = "/content/drive/MyDrive/Capstone/pdf_metadata.csv"

# Make sure the output folder exists before anything is written to it.
Path(TEXT_OUTPUT).mkdir(parents=True, exist_ok=True)

# Collect every PDF below PDF_FOLDER, at any depth.
all_pdfs = [pdf for pdf in Path(PDF_FOLDER).rglob('*.pdf')]

print(
    "=" * 60,
    "STEP 1: TEXT EXTRACTION + METADATA",
    "=" * 60,
    f"Total PDFs: {len(all_pdfs)}\n",
    sep="\n",
)

def extract_doi_from_filename(filename):
    """Recover a DOI from a PDF filename such as '10.1186_1471-2482-8-2.pdf'.

    Only a trailing '.pdf' extension (any letter case) is removed; every
    underscore in the remaining name is then replaced by '/'.

    Args:
        filename: File name (not a full path) of the PDF.

    Returns:
        The name without its '.pdf' extension and with '_' replaced by '/'.
    """
    # Strip the extension only at the end: a blanket str.replace('.pdf', '')
    # would also delete '.pdf' occurring inside the name itself.
    if filename.lower().endswith('.pdf'):
        filename = filename[:-len('.pdf')]
    return filename.replace('_', '/')

def extract_title_from_text(text):
    """Guess a title: the first plausible-length line near the top of ``text``.

    Only the first 10 lines of the stripped text are examined. A line
    qualifies when, after trimming surrounding whitespace, it is longer than
    20 and shorter than 200 characters.

    Returns:
        The first qualifying (trimmed) line, or "Unknown" if there is none.
    """
    head_lines = text.strip().split('\n')[:10]
    trimmed = (raw.strip() for raw in head_lines)
    return next((cand for cand in trimmed if 20 < len(cand) < 200), "Unknown")

# Stripped text shorter than this many characters is flagged as scanned / needing OCR.
MIN_TEXT_LENGTH = 500


def _read_pdf_text(pdf_path):
    """Return the text of every page of ``pdf_path``, each page followed by a blank line.

    Pages that yield no text are skipped. The document is closed even when a
    page fails to extract, so a bad PDF does not leak an open handle.
    """
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        page_texts = []
        for page_number in range(len(pdf)):
            page = pdf[page_number]
            textpage = page.get_textpage()
            page_text = textpage.get_text_range()
            if page_text:
                page_texts.append(page_text + "\n\n")
        return "".join(page_texts)
    finally:
        pdf.close()


def _apply_text_stats(record, text):
    """Fill the text-derived fields of ``record`` (length, method, title, OCR flags) in place."""
    record['text_length'] = len(text.strip())
    record['extraction_method'] = 'pypdfium2'
    record['pdf_title'] = extract_title_from_text(text)

    if record['text_length'] < MIN_TEXT_LENGTH:
        record['is_scanned'] = True
        record['needs_ocr'] = True


metadata_records = []
start_time = time.time()

for pdf_path in tqdm(all_pdfs, desc="Extracting", unit="pdf"):

    doi = extract_doi_from_filename(pdf_path.name)
    txt_file = Path(TEXT_OUTPUT) / f"{doi.replace('/', '_')}.txt"

    # Defaults describe a PDF for which nothing could be extracted.
    record = {
        'pdf_title': 'Unknown',
        'doi': doi,
        'file_size_mb': round(pdf_path.stat().st_size / (1024 * 1024), 2),
        'text_length': 0,
        'is_scanned': False,
        'needs_ocr': False,
        'extraction_method': 'none'
    }

    try:
        if txt_file.exists():
            # Resume support: reuse text written by an earlier run instead of
            # re-extracting. NOTE: such files are always labelled 'pypdfium2'
            # here, whatever tool originally produced them.
            text = txt_file.read_text(encoding='utf-8')
        else:
            text = _read_pdf_text(pdf_path)
            txt_file.write_text(text, encoding='utf-8')

        _apply_text_stats(record, text)

    except Exception as e:
        # Best effort: keep going, flag the PDF for OCR, but say what went
        # wrong instead of hiding it. tqdm.write keeps the progress bar intact.
        tqdm.write(f"Extraction failed for {pdf_path.name}: {e}")
        record['extraction_method'] = 'failed'
        record['needs_ocr'] = True
        record['is_scanned'] = True

    metadata_records.append(record)

# Name the columns explicitly so the frame (and the CSV header) is complete
# even when no PDFs were found; without this an empty run has no columns and
# the lookups below raise KeyError.
METADATA_COLUMNS = [
    'pdf_title', 'doi', 'file_size_mb', 'text_length',
    'is_scanned', 'needs_ocr', 'extraction_method',
]
metadata_df = pd.DataFrame(metadata_records, columns=METADATA_COLUMNS)
metadata_df.to_csv(METADATA_CSV, index=False)

elapsed_time = time.time() - start_time

# Compute each flag mask once and reuse it for the counts and the preview.
needs_ocr_mask = metadata_df['needs_ocr'].astype(bool)
is_scanned_mask = metadata_df['is_scanned'].astype(bool)
needs_ocr_count = int(needs_ocr_mask.sum())

print(f"\n{'='*60}")
print("STEP 1 COMPLETE")
print(f"{'='*60}")
print(f"Total PDFs: {len(metadata_df)}")
print(f"Need OCR: {needs_ocr_count}")
print(f"Time: {elapsed_time/60:.1f} minutes")
print(f"\nFiles saved as: {{doi}}.txt")
print(f"Text folder: {TEXT_OUTPUT}")
print(f"Metadata CSV: {METADATA_CSV}")

print(f"\n{'='*60}")
print("CSV PREVIEW - ALL PDFs")
print(f"{'='*60}")
print(metadata_df.head(10).to_string(index=False))

print(f"\n{'='*60}")
print("STATISTICS")
print(f"{'='*60}")
print(f"is_scanned = True: {int(is_scanned_mask.sum())}")
print(f"needs_ocr = True: {needs_ocr_count}")
print(f"\nText length distribution:")
print(metadata_df['text_length'].describe())

if needs_ocr_count > 0:
    print(f"\n{'='*60}")
    print(f"SCANNED PDFs (needs_ocr=True): {needs_ocr_count}")
    print(f"{'='*60}")
    # Show only the columns relevant to deciding what goes to OCR.
    scanned_preview = metadata_df.loc[
        needs_ocr_mask,
        ['pdf_title', 'doi', 'text_length', 'is_scanned', 'needs_ocr'],
    ]
    print(scanned_preview.head(20).to_string(index=False))
    print(f"\nRun STEP 2 to OCR these {needs_ocr_count} PDFs")
else:
    print("\nAll PDFs have sufficient text - no OCR needed")

STEP 1: TEXT EXTRACTION + METADATA
Total PDFs: 1764



Extracting: 100%|██████████| 1764/1764 [02:34<00:00, 11.43pdf/s]


STEP 1 COMPLETE
Total PDFs: 1764
Need OCR: 5
Time: 2.6 minutes

Files saved as: {doi}.txt
Text folder: /content/drive/MyDrive/Capstone/extracted_text
Metadata CSV: /content/drive/MyDrive/Capstone/pdf_metadata.csv

CSV PREVIEW - ALL PDFs
                                                                                               pdf_title                          doi  file_size_mb  text_length  is_scanned  needs_ocr extraction_method
                                                                 (page number not for citation purposes)        10.1186/1471-2482-8-2          0.21        31149       False      False         pypdfium2
                                           416 Hong Kong Med J Vol 19 No 5 # October 2013 # www.hkmj.org          10.12809/hkmj133793          0.96        34901       False      False         pypdfium2
                                                                 Two types of fatigue in cancer patients         10.1038/bjc.2011.528          0.10         




In [None]:
# ============================================
# STEP 2: TESSERACT OCR FOR SCANNED PDFs
# Replaces low-quality text files with OCR
# Updates metadata CSV
# ============================================

import subprocess
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import time

# Install OCR dependencies FIRST
import sys


def _run_install(command):
    """Run one install command quietly; return True only if it exits with status 0.

    Failures are reported as warnings rather than raised, so a machine that
    already has the tools (but, say, no apt-get) can still proceed.
    """
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        print(f"WARNING: command not found: {command[0]}")
        return False

    if result.returncode != 0:
        print(f"WARNING: {' '.join(command)} failed (exit code {result.returncode})")
        error_lines = result.stderr.strip().splitlines()
        if error_lines:
            # The last stderr line is usually the most specific message.
            print(f"         {error_lines[-1]}")
        return False
    return True


print("Installing OCR tools...")
install_commands = [
    ["apt-get", "update"],
    ["apt-get", "install", "-y", "tesseract-ocr", "poppler-utils"],
    # Invoke pip through the running interpreter so the packages land in the
    # environment this kernel actually imports from.
    [sys.executable, "-m", "pip", "install", "pytesseract", "pdf2image", "Pillow"],
]
# Run every command (no short-circuit) so all problems are reported at once.
install_results = [_run_install(command) for command in install_commands]

if all(install_results):
    print("OCR tools installed\n")
else:
    print("OCR tool installation reported errors - the imports below may fail\n")

# NOW import the modules
from PIL import Image
import pytesseract
import pdf2image

# Locations of the source PDFs, the extracted-text folder and the metadata CSV.
PDF_FOLDER = "/content/drive/MyDrive/Capstone/papers"
TEXT_OUTPUT = "/content/drive/MyDrive/Capstone/extracted_text"
METADATA_CSV = "/content/drive/MyDrive/Capstone/pdf_metadata.csv"

# Read the metadata CSV and take an independent copy of the rows flagged
# needs_ocr; the full frame is kept so it can be updated and saved again.
metadata_df = pd.read_csv(METADATA_CSV)
needs_ocr_df = metadata_df.loc[metadata_df['needs_ocr'] == True].copy()

print(
    "=" * 60,
    "STEP 2: TESSERACT OCR",
    "=" * 60,
    f"PDFs to OCR: {len(needs_ocr_df)}\n",
    sep="\n",
)

if len(needs_ocr_df) == 0:
    print("No PDFs need OCR - all done")
else:
    ocr_success = 0
    ocr_failed = 0
    start_time = time.time()
    total_to_ocr = len(needs_ocr_df)

    # Walk the PDF folder once and look files up by exact name. This avoids a
    # full directory walk per PDF and avoids treating the filename as a glob
    # pattern. setdefault keeps the first path found for a given name.
    pdf_by_name = {}
    for found_pdf in Path(PDF_FOLDER).rglob('*.pdf'):
        pdf_by_name.setdefault(found_pdf.name, found_pdf)

    # `idx` is the row label in metadata_df (used for .loc updates);
    # `processed` is the 1-based count of PDFs handled so far (used for progress).
    for processed, (idx, row) in enumerate(
            tqdm(needs_ocr_df.iterrows(), total=total_to_ocr, desc="OCR", unit="pdf"),
            start=1):

        doi = row['doi']
        file_stem = doi.replace('/', '_')
        txt_file = Path(TEXT_OUTPUT) / f"{file_stem}.txt"
        pdf_path = pdf_by_name.get(file_stem + '.pdf')

        if pdf_path is None:
            # Record the reason in the metadata instead of only counting it.
            ocr_failed += 1
            metadata_df.loc[idx, 'extraction_method'] = 'ocr_failed: pdf not found'
        else:
            try:
                # Convert PDF to images
                images = pdf2image.convert_from_path(str(pdf_path), dpi=200)

                # OCR each page, each page followed by a blank line
                ocr_text = "".join(
                    pytesseract.image_to_string(image, lang='eng') + "\n\n"
                    for image in images
                )

                # Replace text file
                with open(txt_file, "w", encoding="utf-8") as f:
                    f.write(ocr_text)

                # Update metadata
                metadata_df.loc[idx, 'text_length'] = len(ocr_text.strip())
                metadata_df.loc[idx, 'extraction_method'] = 'tesseract_ocr'
                # OCR is done for this PDF; clear the flag so re-running this
                # step does not OCR it again. is_scanned is left unchanged.
                metadata_df.loc[idx, 'needs_ocr'] = False

                # Update title: first of the first 10 lines with a plausible
                # length; the previous title is kept when none qualifies.
                for line in ocr_text.strip().split('\n')[:10]:
                    line = line.strip()
                    if 20 < len(line) < 200:
                        metadata_df.loc[idx, 'pdf_title'] = line
                        break

                ocr_success += 1

            except Exception as e:
                ocr_failed += 1
                metadata_df.loc[idx, 'extraction_method'] = f'ocr_failed: {str(e)[:50]}'

        # Progress report every 5 processed PDFs, based on the processed count
        # (not the DataFrame index label, which can be any row number).
        if processed % 5 == 0:
            elapsed = time.time() - start_time
            rate = processed / elapsed if elapsed > 0 else 0
            eta = (total_to_ocr - processed) / rate if rate > 0 else 0
            print(f"\nOCR: {processed}/{total_to_ocr}")
            print(f"Success: {ocr_success} | Failed: {ocr_failed}")
            print(f"Elapsed: {elapsed/60:.1f} min | ETA: {eta/60:.1f} min\n")

    # Save updated metadata
    metadata_df.to_csv(METADATA_CSV, index=False)

    elapsed_time = time.time() - start_time

    print(f"\n{'='*60}")
    print("STEP 2 COMPLETE")
    print(f"{'='*60}")
    print(f"OCR success: {ocr_success}")
    print(f"OCR failed: {ocr_failed}")
    print(f"Time: {elapsed_time/60:.1f} minutes")

# Final summary: number of text files on disk and a breakdown of how each
# PDF's text was obtained, according to the metadata frame.
print("\n" + "=" * 60)
print("FINAL SUMMARY")
print("=" * 60)

total_files = sum(1 for _ in Path(TEXT_OUTPUT).glob('*.txt'))
print(f"Total text files: {total_files}")
print("\nExtraction methods:")
print(metadata_df['extraction_method'].value_counts().to_string())
print(f"\nText files: {TEXT_OUTPUT}\nMetadata: {METADATA_CSV}")
print("\nAll PDFs processed and ready for analysis")
print(f"{'=' * 60}")

Installing OCR tools...
OCR tools installed

STEP 2: TESSERACT OCR
PDFs to OCR: 5



OCR:  80%|████████  | 4/5 [02:54<00:40, 40.71s/pdf]


OCR: 1565/5
Success: 4 | Failed: 0
Elapsed: 2.9 min | ETA: -2.9 min



OCR: 100%|██████████| 5/5 [02:55<00:00, 35.07s/pdf]


STEP 2 COMPLETE
OCR success: 5
OCR failed: 0
Time: 2.9 minutes

FINAL SUMMARY
Total text files: 1764

Extraction methods:
extraction_method
pypdfium2        1759
tesseract_ocr       5

Text files: /content/drive/MyDrive/Capstone/extracted_text
Metadata: /content/drive/MyDrive/Capstone/pdf_metadata.csv

All PDFs processed and ready for analysis



