# Libraries and packages

In [None]:
%pip install tqdm requests python-dotenv
%pip install PyPDF2 


# Variable Settings

In [12]:
import os
from dotenv import load_dotenv

load_dotenv()

# === CONFIG ===
# The two environment-backed values below can be supplied through a .env
# file, or replaced here with literal values.
FOLDER_PATH = os.environ.get("FOLDER_PATH")  # root folder that is walked for PDFs
YOUR_EMAIL = os.environ.get("YOUR_EMAIL")    # sent to CrossRef as the "mailto" parameter

# Output artifacts; both are written inside FOLDER_PATH.
OUTPUT_CSV = 'bibliography_metadata.csv'
LOG_FILE = 'processing_log.txt'

SIMILARITY_THRESHOLD = 0.6  # Minimum title similarity (0-1 scale)

print(f"Email for CrossRef: {YOUR_EMAIL}")
print(f"Path to Folder {FOLDER_PATH}")

Email for CrossRef: mrhallonline@gmail.com
Path to Folder /Users/kevinhall/Library/CloudStorage/Dropbox/Grad School/2025 Spring/MyResearch/Research_PDFs


In [None]:
import os
import csv
import re
import requests
import time
from PyPDF2 import PdfReader
from tqdm import tqdm
from difflib import SequenceMatcher

# === Check if folder exists ===
# FOLDER_PATH is read with os.getenv and is therefore None when the variable
# is not set; fail with a clear message instead of an opaque TypeError from
# os.path.exists(None) (or a FileNotFoundError from os.makedirs('')).
if not FOLDER_PATH:
    raise ValueError(
        "FOLDER_PATH is not set. Define it in your .env file or assign it "
        "directly in the config section."
    )

if not os.path.exists(FOLDER_PATH):
    os.makedirs(FOLDER_PATH)
    print(f"📂 Created missing folder: {FOLDER_PATH}")

# === Helper function to sanitize file names ===
def sanitize_filename(s):
    """Return *s* made safe for use as part of a file name.

    The characters ``\\ / * ? : " < > |`` are removed and every space is
    replaced with an underscore.
    """
    without_reserved = re.sub(r'[\\/*?:"<>|]', "", s)
    underscored = without_reserved.replace(" ", "_")
    return underscored

# === CrossRef query ===
def _format_crossref_author(author):
    """Return one author entry as 'Family, Given', tolerating missing parts."""
    family = (author.get('family') or '').strip()
    given = (author.get('given') or '').strip()
    if family and given:
        return f"{family}, {given}"
    # Fall back to whichever part exists; some records appear to carry only a
    # single 'name' field (e.g. institutional authors) — verify against the
    # CrossRef schema if this matters for your data.
    return family or given or (author.get('name') or '').strip()


def _crossref_year(item):
    """Return the first year in item['issued']['date-parts'], or '' if absent."""
    date_parts = (item.get('issued') or {}).get('date-parts') or []
    first_date = date_parts[0] if date_parts else []
    # The first date may be empty or hold a null year; treat both as unknown.
    return first_date[0] if first_date and first_date[0] else ''


def query_crossref(title, max_results=1):
    """Look up a work on CrossRef by title and return its bibliographic data.

    Sends one GET request to the CrossRef ``/works`` endpoint using the
    ``query.bibliographic`` parameter and reads only the first returned item.

    Args:
        title: Free-text title (or file name stem) to search for.
        max_results: Value sent as the ``rows`` parameter. Only the first
            item of the response is used regardless of this value.

    Returns:
        A dict with the keys ``'Author'``, ``'Title'``, ``'Year'``,
        ``'Journal/Source'`` and ``'DOI'``, or ``None`` when the request
        fails or no items are returned. Missing author, title or year are
        reported as ``'UnknownAuthor'``, ``'UnknownTitle'`` and
        ``'UnknownYear'``.
    """
    url = "https://api.crossref.org/works"
    params = {
        "query.bibliographic": title,
        "rows": max_results,
        "select": "title,author,issued,container-title,DOI",
        "mailto": YOUR_EMAIL
    }

    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        items = data['message']['items']
        if items:
            item = items[0]

            # Each field is read defensively so that one missing or empty
            # field does not discard the rest of an otherwise usable record.
            formatted_authors = (
                _format_crossref_author(a) for a in item.get('author') or []
            )
            author_names = ", ".join(name for name in formatted_authors if name)

            titles = item.get('title') or []
            result_title = titles[0] if titles else ''

            year = _crossref_year(item)

            containers = item.get('container-title') or []
            journal = containers[0] if containers else ''

            doi = item.get('DOI', '')

            return {
                'Author': author_names if author_names else 'UnknownAuthor',
                'Title': result_title if result_title else 'UnknownTitle',
                'Year': str(year) if year else 'UnknownYear',
                'Journal/Source': journal,
                'DOI': doi
            }
    except Exception as e:
        # Deliberately best-effort: any network, HTTP, or parsing problem is
        # logged and reported to the caller as "no result".
        log(f"CrossRef query failed for title '{title}': {e}")

    return None

# === Logging function ===
def log(message):
    """Append *message* to the log file in FOLDER_PATH and echo it to stdout."""
    log_path = os.path.join(FOLDER_PATH, LOG_FILE)
    with open(log_path, 'a', encoding='utf-8') as log_handle:
        log_handle.write(f"{message}\n")
    print(message)

# === Main processing ===
# For every PDF under FOLDER_PATH: read its embedded metadata, fill gaps from
# CrossRef when the returned title is similar enough, rename the file to
# "<author>_<year>_<title>.pdf", and collect one row per file for the CSV.
bibliography_data = []

# Count only the PDF files so the progress-bar total matches the number of
# pbar.update() calls made below.
total_pdfs = sum(
    1
    for _, _, files in os.walk(FOLDER_PATH)
    for f in files
    if f.lower().endswith('.pdf')
)

with tqdm(total=total_pdfs, desc="Processing PDFs") as pbar:
    for dirpath, _dirnames, filenames in os.walk(FOLDER_PATH):
        for file in filenames:
            if not file.lower().endswith('.pdf'):
                continue

            full_path = os.path.join(dirpath, file)
            log(f"\n📄 Processing: {full_path}")

            # Initial metadata extraction
            try:
                reader = PdfReader(full_path)
                # reader.metadata is None for PDFs without a document
                # information dictionary; normalise to an empty mapping.
                metadata = reader.metadata or {}
            except Exception as e:
                log(f"⚠️ Could not read {file}: {e}")
                metadata = {}

            # "or ''" guards against a key that is present but empty/None.
            author = (metadata.get('/Author') or '').strip()
            title = (metadata.get('/Title') or '').strip()
            # Take the year from the creation date, or from the file name
            # when the PDF carries no creation date.
            year_match = re.search(r'(19|20)\d{2}', metadata.get('/CreationDate', '') or file)
            year = year_match.group(0) if year_match else ''

            fallback_title = os.path.splitext(file)[0]

            # Use CrossRef if needed
            use_crossref = not author or not title or not year
            journal = ''
            doi = ''

            if use_crossref:
                query_title = title if title else fallback_title
                log(f"🔍 Querying CrossRef for: '{query_title}'")
                crossref_result = query_crossref(query_title)
                time.sleep(1)  # Be polite!

                accepted = False
                if crossref_result:
                    similarity = SequenceMatcher(None, query_title.lower(), crossref_result['Title'].lower()).ratio()
                    log(f"CrossRef title match similarity: {similarity:.2f}")

                    accepted = similarity >= SIMILARITY_THRESHOLD
                    if accepted:
                        log(f"✅ CrossRef match accepted for '{query_title}'.")
                        author = crossref_result['Author']
                        title = crossref_result['Title']
                        year = crossref_result['Year']
                        journal = crossref_result['Journal/Source']
                        doi = crossref_result['DOI']
                    else:
                        log("⚠️ CrossRef match rejected (similarity too low). Keeping fallback metadata.")
                else:
                    log("⚠️ No CrossRef result. Keeping fallback metadata.")

                if not accepted:
                    # Keep whatever the PDF provided and fill the gaps with
                    # placeholders.
                    author = author or 'UnknownAuthor'
                    title = title or fallback_title
                    year = year or 'UnknownYear'
            else:
                log("✅ Metadata from PDF is sufficient.")

            # Sanitize + rename
            title_key = sanitize_filename(title[:40])
            name_stem = f"{sanitize_filename(author)}_{year}_{title_key}"
            new_filename = f"{name_stem}.pdf"
            new_full_path = os.path.join(dirpath, new_filename)

            # os.rename silently replaces an existing destination on POSIX,
            # so pick a numbered name when a *different* file already holds
            # the target name. samefile() keeps re-runs and case-only renames
            # from being treated as collisions with the file itself.
            suffix = 2
            while (os.path.exists(new_full_path)
                   and not os.path.samefile(new_full_path, full_path)):
                new_filename = f"{name_stem}_{suffix}.pdf"
                new_full_path = os.path.join(dirpath, new_filename)
                suffix += 1

            # Record the name the file actually has on disk after this step.
            final_filename = file
            try:
                os.rename(full_path, new_full_path)
                final_filename = new_filename
                log(f"✅ Renamed to: {new_filename}")
            except (OSError, ValueError) as e:
                log(f"⚠️ Failed to rename {file}: {e}")

            # Save metadata
            bibliography_data.append({
                'Document Name': final_filename,
                'Original Folder': dirpath,
                'Author': author,
                'Year': year,
                'Title': title,
                'Journal/Source': journal,
                'DOI': doi
            })

            pbar.update(1)

# === Write CSV ===
# One row per processed PDF, written next to the PDFs in FOLDER_PATH.
fieldnames = ['Document Name', 'Original Folder', 'Author', 'Year', 'Title', 'Journal/Source', 'DOI']
csv_path = os.path.join(FOLDER_PATH, OUTPUT_CSV)

with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(bibliography_data)

log(f"\n✅ Bibliographic metadata CSV saved to: {csv_path}")
log(f"📝 Log file saved to: {os.path.join(FOLDER_PATH, LOG_FILE)}")
