In [1]:
import os
from dotenv import load_dotenv
from pyzotero import zotero

# Load settings from a local .env file (python-dotenv) into the process
# environment before the credentials are read below.
load_dotenv()

# Credentials are read from the environment so they never appear in the notebook.
# Either value is None when the corresponding variable is not set.
ZOTERO_USER_ID = os.environ.get('ZOTERO_USER_ID')
ZOTERO_API_KEY = os.environ.get('ZOTERO_API_KEY')

# Zotero API client for a 'user' (personal) library; reused by later cells.
zot = zotero.Zotero(ZOTERO_USER_ID, 'user', ZOTERO_API_KEY)

In [10]:
# Folder holding the PDFs to match against. Set the ZOTERO_PDF_DIR environment
# variable to point elsewhere; the default keeps the original location.
ZOTERO_PDF_DIR = os.getenv('ZOTERO_PDF_DIR', '/home/mlevi/OneDrive/Zotero')

# os.listdir() yields entries in arbitrary order and includes anything in the
# folder (sub-directories, sync artefacts, non-PDF files). Keep only PDF names
# and sort them so fuzzy-match tie-breaking is reproducible between runs.
pdf_list = sorted(
    entry for entry in os.listdir(ZOTERO_PDF_DIR)
    if entry.lower().endswith('.pdf')
)

In [None]:
from difflib import SequenceMatcher

def get_similarity(str1, str2):
    """Return the case-insensitive similarity ratio of two strings.

    Both inputs are lower-cased and compared with ``difflib.SequenceMatcher``;
    the result is its ``ratio()``, a float between 0.0 (nothing in common)
    and 1.0 (identical).
    """
    left = str1.lower()
    right = str2.lower()
    matcher = SequenceMatcher(None, a=left, b=right)
    return matcher.ratio()

def find_best_match(title, pdf_list, threshold=0.5):
    """Find the PDF filename that most closely resembles ``title``.

    Args:
        title: Title string to look up.
        pdf_list: Iterable of PDF filenames to compare against.
        threshold: Minimum similarity ratio required to accept the best
            candidate as a match.

    Returns:
        A ``(best_match, best_score)`` tuple. ``best_match`` is the original
        filename of the closest candidate, or ``None`` when the best score is
        below ``threshold`` (or ``pdf_list`` is empty). ``best_score`` is the
        highest similarity seen, returned even when no match is accepted.
    """
    best_match = None
    best_score = 0

    for pdf in pdf_list:
        # Strip only a trailing extension, case-insensitively. A blanket
        # str.replace('.pdf', '') would also delete '.pdf' in the middle of a
        # name and would leave '.PDF' in place, lowering the score.
        stem = pdf[:-4] if pdf.lower().endswith('.pdf') else pdf
        # Drop the 'et al.' author marker, which never appears in the title.
        pdf_name = stem.replace('et al.', '')
        similarity = get_similarity(title, pdf_name)

        # Strict '>' keeps the first candidate when scores tie.
        if similarity > best_score:
            best_score = similarity
            best_match = pdf

    if best_score >= threshold:
        return best_match, best_score
    return None, best_score

In [None]:
# Tallies, plus the parallel title / filename lists that the export cell writes out.
matched_count, unmatched_count = 0, 0
low_confidence_matches = []
titles, pdf_names = [], []

collection_items = zot.everything(zot.collection_items_top('SVUM4G2M', limit=106))

for metadata in collection_items:
    # Entries without a title cannot be matched and are skipped entirely.
    if 'data' not in metadata or 'title' not in metadata['data']:
        continue

    title = metadata['data']['title']
    best_match, similarity = find_best_match(title, pdf_list, threshold=0.5)

    if best_match:
        matched_count += 1
        print(f"✓ Match (score: {similarity:.2f}): {title[:60]}... → {best_match}")
        # Accepted but weak matches are queued for manual review.
        if similarity < 0.7:
            low_confidence_matches.append((title, best_match, similarity))
    else:
        unmatched_count += 1
        print(f"✗ No match (best score: {similarity:.2f}): {title[:60]}...")

    # One row per titled item; an empty string marks "no PDF found".
    titles.append(title)
    pdf_names.append(best_match if best_match else '')

# Overall summary.
print(f"\n{'='*80}")
print(f"Summary: {matched_count} matched, {unmatched_count} unmatched")
print(f"Low confidence matches (< 0.7): {len(low_confidence_matches)}")

# Show at most the first ten weak matches so the output stays readable.
if len(low_confidence_matches) > 0:
    print(f"\n{'='*80}")
    print("Low confidence matches to review:")
    for title, pdf, score in low_confidence_matches[:10]:
        print(f"  Score {score:.2f}: {title[:50]}... → {pdf}")

✓ Match (score: 0.67): Facial expression synthesis based on denoising diffusion pro... → Ho et al. - 2020 - Denoising Diffusion Probabilistic Models.pdf
✓ Match (score: 1.00): Uncertainty-Aware Semi-Supervised Learning of 3D Face Riggin... → Uncertainty-Aware Semi-Supervised Learning of 3D Face Rigging from Single Image.pdf
✓ Match (score: 1.00): Talking Face Generation with Expression-Tailored Generative ... → Talking Face Generation with Expression-Tailored Generative Adversarial Network.pdf
✓ Match (score: 0.99): Synthesis of Facial Expressions in Photographs: Characterist... → Synthesis of Facial Expressions in Photographs Characteristics, Approaches, and Challenges.pdf
✓ Match (score: 0.88): Reviving Intentional Facial Expressions: an Interface for AL... → Reviving Intentional Facial Expressions an Interface for ALS Patients using Brain Decoding and Imag.pdf
✓ Match (score: 1.00): Fine-grained Micro-Expression Generation based on Thin-Plate... → Fine-grained Micro-Expression Gener

In [10]:
import pandas as pd

# Persist the title -> PDF filename pairing built by the matching cell.
# `titles` and `pdf_names` are parallel lists, one entry per titled item.
matches_table = pd.DataFrame({'title': titles, 'pdf_name': pdf_names})
matches_table.to_csv('zotero_pdf_matches.csv', index=False)

In [None]:
import pandas as pd

# Reload the exported matches. The file is written by DataFrame.to_csv with its
# default UTF-8 encoding, so it must be decoded as UTF-8 here; reading it as
# windows-1252 garbles (or fails on) any non-ASCII characters in the titles.
library = pd.read_csv('zotero_pdf_matches.csv', encoding='utf-8')
library.head()

0
1
2
3
4


In [15]:
# NOTE(review): the traceback recorded under this cell shows this import failing
# with ImportError: zotero_integration does not expose `pull_from_zotero`.
# TODO confirm the name actually exported by src/data/zotero_integration.py.
from zotero_integration import pull_from_zotero

# Display the element at index 4 of the returned value.
# Presumably one pulled library item; verify against the helper's return type.
pull_from_zotero()[4]

ImportError: cannot import name 'pull_from_zotero' from 'zotero_integration' (/home/mlevi/Work/research-assistant/src/data/zotero_integration.py)