# Payper Boy Prototype

## Step 1: Fetching Latest articles using crawl4ai

In [None]:
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    """Crawl one arXiv CS catch-up page and print a preview of its markdown.

    Fetches the fixed catch-up URL for 2025-04-08 and prints the first 3000
    characters of the markdown produced by the crawler. When the crawl reports
    failure or yields no markdown, a diagnostic is printed instead.
    """
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://arxiv.org/catchup/cs/2025-04-08")
        if not result.success or not result.markdown:
            # Slicing a missing markdown value would raise; report and stop.
            print(f"Crawl failed or returned no content. Error: {result.error_message}")
            return
        print(result.markdown[:3000])  # Print first 3000 chars

# Top-level await: presumably relies on the notebook kernel running this cell
# inside an event loop (not valid in a plain Python script) - TODO confirm.
await main()

## Step 2: Extracting article details

In [None]:
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, JsonCssExtractionStrategy
import datetime
import re
import nest_asyncio
import traceback

# Apply nest_asyncio to allow running asyncio in Jupyter
nest_asyncio.apply()

# --- Schema 1: Extract metadata from <dd> ---
def _dd_text_field(name, selector):
    """Return a text-field spec; *selector* is relative to the <dd> element."""
    return {"name": name, "selector": selector, "type": "text", "default": None}


schema_dd = {
    "name": "ArXiv Article Metadata",
    # Target the <dd> elements directly: one extracted record per <dd>.
    "baseSelector": "dl#articles > dd",
    "fields": [
        _dd_text_field("raw_title", "div.list-title"),
        {
            # One {"author_name": ...} entry per matched author link.
            "name": "authors",
            "selector": "div.list-authors a",
            "type": "list",
            "fields": [{"name": "author_name", "type": "text"}],
        },
        _dd_text_field("raw_subjects", "div.list-subjects"),
        _dd_text_field("primary_subject", "div.list-subjects span.primary-subject"),
        _dd_text_field("raw_comments", "div.list-comments"),
        _dd_text_field("raw_journal_ref", "div.list-journal-ref"),
    ],
}

# --- Schema 2: Extract IDs and Links from <dt> ---
def _dt_link_field(name, link_title, attribute=None, regex=None):
    """Describe a field read from the <dt> anchor whose title is *link_title*.

    When *attribute* is given the spec extracts that attribute; otherwise it
    extracts the anchor text. A "regex" key is included only when *regex* is
    provided. Keys are inserted in the order name, selector, type,
    attribute, regex, default.
    """
    spec = {"name": name, "selector": f"a[title='{link_title}']"}
    if attribute is None:
        spec["type"] = "text"
    else:
        spec["type"] = "attribute"
        spec["attribute"] = attribute
    if regex is not None:
        spec["regex"] = regex
    spec["default"] = None
    return spec


schema_dt = {
    "name": "ArXiv Article Links",
    # Target the <dt> elements: one extracted record per <dt>.
    "baseSelector": "dl#articles > dt",
    "fields": [
        # Permissive patterns so that various ID formats are caught.
        _dt_link_field("arxiv_id_from_href", "Abstract", attribute="href", regex=r"/abs/([^/]+)"),
        _dt_link_field("arxiv_id_from_text", "Abstract", regex=r"arXiv:([^\s]+)"),
        _dt_link_field("abstract_url_rel", "Abstract", attribute="href"),
        _dt_link_field("pdf_url_rel", "Download PDF", attribute="href"),
        _dt_link_field("html_url", "View HTML", attribute="href"),
    ],
}

# --- Schema 3: Try to detect section headers (optional enhancement) ---
# NOTE(review): no code visible in this file builds an extraction strategy
# from this schema, and nothing visible sets the "data-position" attribute
# that the "position" field reads.
schema_headers = dict(
    name="ArXiv Section Headers",
    baseSelector="dl#articles > dt.newheader",
    fields=[
        # Empty selector: intended to address the matched element itself.
        dict(name="header_text", selector="", type="text", default=None),
        dict(
            name="position",
            selector="",
            type="attribute",
            attribute="data-position",
            default=None,
        ),
    ],
)

# --- Helper Function for Post-Processing (Improved) ---
# New-style IDs: YYMM.NNNN or YYMM.NNNNN, with an optional version suffix.
_STD_ID_RE = re.compile(r"(\d{4}\.\d{4,5}(?:v\d+)?)")
# Old-style IDs: archive[.SC]/YYMMNNN, with an optional version suffix.
_OLD_ID_RE = re.compile(r"([a-z-]+(?:\.[A-Z]{2})?\/\d{7}(?:v\d+)?)")
# Last resort: the trailing run of characters with no slash or whitespace.
_FALLBACK_ID_RE = re.compile(r"([^\/\s]+)(?:v\d+)?$")
_ID_PREFIX = "arXiv:"


def _extract_canonical_id(*sources):
    """Return the first arXiv ID found in *sources* (tried in order), or None."""
    for source in sources:
        if not isinstance(source, str) or not source:
            continue
        for pattern in (_STD_ID_RE, _OLD_ID_RE, _FALLBACK_ID_RE):
            match = pattern.search(source)
            if not match:
                continue
            found = match.group(1)
            # The fallback can capture the "arXiv:" label itself; drop it so
            # the caller does not produce "arXiv:arXiv:...".
            if found.startswith(_ID_PREFIX):
                found = found[len(_ID_PREFIX):]
            if found:
                return found
    return None


def _absolute_url(url, base_url):
    """Prefix *base_url* onto a relative URL; return anything else unchanged."""
    if not isinstance(url, str) or url.startswith(('http://', 'https://')):
        return url
    if not url.startswith('/'):
        url = '/' + url
    return f"{base_url}{url}"


def _strip_label(raw_text, label):
    """Remove a leading ``<label>:`` marker from *raw_text*; None if empty."""
    if not raw_text:
        return None
    pattern = rf"^\s*{re.escape(label)}:\s*"
    return re.sub(pattern, '', raw_text, flags=re.IGNORECASE).strip()


def process_and_merge_articles(dt_data_list, dd_data_list, base_url="https://arxiv.org"):
    """Merge positionally paired <dt>/<dd> records into cleaned article dicts.

    Args:
        dt_data_list: Records holding the ID and link fields
            (``arxiv_id_from_href``, ``arxiv_id_from_text``,
            ``abstract_url_rel``, ``pdf_url_rel``, ``html_url``).
        dd_data_list: Records holding the metadata fields (``raw_title``,
            ``authors``, ``raw_subjects``, ``primary_subject``,
            ``raw_comments``, ``raw_journal_ref``).
        base_url: Prefix applied to relative links.

    Returns:
        A list of dicts with the keys ``arxiv_id``, ``abstract_url``,
        ``pdf_url``, ``html_url``, ``title``, ``subjects``, ``comments``,
        ``journal_ref``, ``primary_subject`` and ``authors``. Pairs are
        matched by index up to the shorter list; a pair with no extractable
        ID is skipped with a printed warning.
    """
    merged_articles = []
    num_items = min(len(dt_data_list), len(dd_data_list))
    print(f"Attempting to merge {num_items} dt/dd pairs.")

    id_extraction_failures = 0

    # zip() stops at the shorter list, matching num_items above.
    for entry_no, (dt_data, dd_data) in enumerate(zip(dt_data_list, dd_data_list), start=1):
        id_from_href_raw = dt_data.get('arxiv_id_from_href')
        id_from_text_raw = dt_data.get('arxiv_id_from_text')
        canonical_id = _extract_canonical_id(id_from_href_raw, id_from_text_raw)

        if not canonical_id:
            id_extraction_failures += 1
            # raw_title may be present but None, so normalise before slicing.
            context_hint = (
                (dd_data.get('raw_title') or '')[:50]
                or dt_data.get('abstract_url_rel')
                or f'Entry {entry_no}'
            )
            print(f"Warning: Skipping entry {entry_no}, failed to extract valid canonical arXiv ID. Context hint: {context_hint}")
            print(f"  - href source: {id_from_href_raw}")
            print(f"  - text source: {id_from_text_raw}")
            continue

        # --- URL Construction (from dt_data) ---
        abstract_rel = dt_data.get('abstract_url_rel')
        pdf_rel = dt_data.get('pdf_url_rel')
        html_url = dt_data.get('html_url')

        # --- Author List (from dd_data); tolerate a None or missing value ---
        author_list = dd_data.get('authors') or []

        merged_articles.append({
            'arxiv_id': f"arXiv:{canonical_id}",
            'abstract_url': (
                _absolute_url(abstract_rel, base_url)
                if abstract_rel
                else f"{base_url}/abs/{canonical_id}"
            ),
            'pdf_url': _absolute_url(pdf_rel, base_url) if pdf_rel else None,
            'html_url': _absolute_url(html_url, base_url) if html_url else None,
            # --- Text Cleaning (from dd_data) ---
            'title': _strip_label(dd_data.get('raw_title'), 'Title'),
            'subjects': _strip_label(dd_data.get('raw_subjects'), 'Subjects'),
            'comments': _strip_label(dd_data.get('raw_comments'), 'Comments'),
            'journal_ref': _strip_label(dd_data.get('raw_journal_ref'), 'Journal-ref'),
            'primary_subject': dd_data.get('primary_subject'),
            'authors': [
                author.get('author_name')
                for author in author_list
                if isinstance(author, dict) and author.get('author_name')
            ],
        })

    # Print summary of failures (num_items > 0 whenever a failure was counted)
    if id_extraction_failures:
        print(f"Total ID extraction failures: {id_extraction_failures} out of {num_items} ({id_extraction_failures/num_items*100:.1f}%)")

    return merged_articles

# --- Asynchronous Function to Crawl and Extract (Modified) ---
async def _crawl_extracted_json(crawler, url, config, tag):
    """Run one crawl and return its extracted content parsed from JSON.

    *tag* ("dd" or "dt") is used only in the progress messages. Returns the
    parsed value, or None after printing a diagnostic when the crawl fails,
    yields no extracted content, or the content is not valid JSON.
    """
    print(f"Running crawler for <{tag}> elements...")
    result = await crawler.arun(url=url, config=config)
    if not (result.success and result.extracted_content):
        print(f"Crawl for <{tag}> failed or no content. Error: {result.error_message}")
        return None
    try:
        parsed = json.loads(result.extracted_content)
    except json.JSONDecodeError:
        print(f"Failed to parse <{tag}> JSON. First 200 chars: {result.extracted_content[:200]}")
        return None
    print(f"Successfully extracted {len(parsed)} <{tag}> entries.")
    return parsed


async def get_arxiv_new_submissions(target_date: str, collect_all=True):
    """
    Fetches and extracts submissions from arXiv CS catchup for a specific date.

    The page is crawled twice, once with the <dd> schema and once with the
    <dt> schema, and the two result lists are merged by position.

    Args:
        target_date: Date in YYYY-MM-DD format
        collect_all: Accepted for backward compatibility; the function body
            does not read it and always returns every merged article.

    Returns:
        A list of merged article dicts, or an empty list when the date is
        invalid, a crawl or parse step fails, or nothing could be merged.
    """
    base_url = "https://arxiv.org"
    try:
        date_obj = datetime.datetime.strptime(target_date, "%Y-%m-%d")
    except (ValueError, TypeError):
        # TypeError covers a non-string date, which strptime also rejects.
        print("Error: Invalid date format. Please use YYYY-MM-DD.")
        return []
    target_url = f"https://arxiv.org/catchup/cs/{date_obj.strftime('%Y-%m-%d')}"
    print(f"Targeting URL: {target_url}")

    # --- Setup Extraction Strategies and Crawler Configurations ---
    strategy_dd = JsonCssExtractionStrategy(schema_dd, verbose=False)
    strategy_dt = JsonCssExtractionStrategy(schema_dt, verbose=False)
    config_dd = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, extraction_strategy=strategy_dd)
    config_dt = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, extraction_strategy=strategy_dt)

    try:
        # --- Run Crawlers ---
        async with AsyncWebCrawler(verbose=False) as crawler:
            dd_data_list = await _crawl_extracted_json(crawler, target_url, config_dd, "dd")
            if dd_data_list is None:
                return []  # Stop if we can't get <dd> data

            dt_data_list = await _crawl_extracted_json(crawler, target_url, config_dt, "dt")
            if dt_data_list is None:
                return []

        # --- Merge and Post-process ---
        if len(dt_data_list) != len(dd_data_list):
            print(f"Warning: Mismatch between number of <dt> ({len(dt_data_list)}) and <dd> ({len(dd_data_list)}) elements. Merging based on minimum.")

        all_processed_articles = process_and_merge_articles(dt_data_list, dd_data_list, base_url)

        if not all_processed_articles:
            print("Warning: No articles were successfully merged and processed.")
            return []

        # Instead of filtering for "new submissions" vs "replacements", return all articles
        # This avoids the premature stopping issue
        print(f"Collected {len(all_processed_articles)} total articles.")
        return all_processed_articles

    except Exception as e:
        # Notebook-level boundary: report the failure and return no results.
        print(f"An error occurred during processing: {e}")
        traceback.print_exc()
        return []

# --- Post-processing Function to Categorize Articles ---
def categorize_articles(articles):
    """
    Group articles into new submissions, cross-listings and replacements.

    Categorization is not implemented yet: every article is returned under
    'new_submissions', and 'cross_listings' and 'replacements' are always
    empty lists.

    Args:
        articles: Iterable of article records.

    Returns:
        A dict with the keys 'new_submissions', 'cross_listings' and
        'replacements', each mapping to a new list.
    """
    return {
        # Copy so the caller's list is not aliased by the result.
        'new_submissions': list(articles),
        'cross_listings': [],
        'replacements': [],
    }

# --- Example Usage ---
async def main():
    """Fetch one day's CS listing, print a preview, and save it as JSON.

    The date is fixed in the body. When nothing is extracted a message is
    printed and no file is written; otherwise the first and last three
    entries are printed and the full list is written to
    ``arxiv_cs_submissions_<date>.json`` in the working directory.
    """
    target_date = "2025-04-01" # Date from sample HTML
    print(f"Attempting to fetch CS submissions announced on {target_date}...")

    # Every article for the date is requested; no filtering is applied.
    submissions = await get_arxiv_new_submissions(target_date, collect_all=True)

    if not submissions:
        print("\nNo submissions data was extracted.")
        return

    def pretty(entries):
        return json.dumps(entries, indent=2, ensure_ascii=False)

    total = len(submissions)
    print(f"\n--- Extracted {total} Total Submissions ---")

    # Optional next step: pass the list to categorize_articles() and report
    # or save each category separately.

    print("\nFirst 3 entries:")
    print(pretty(submissions[:3]))

    if total > 3:
        print("\nLast 3 entries:")
        print(pretty(submissions[-3:]))

    output_filename = f"arxiv_cs_submissions_{target_date}.json"
    with open(output_filename, 'w', encoding='utf-8') as out_file:
        json.dump(submissions, out_file, ensure_ascii=False, indent=2)
    print(f"\nFull data saved to {output_filename}")

# --- Run the main function ---
# __name__ is "__main__" both for a script run and inside a notebook kernel,
# so this single branch serves both. In a notebook it depends on the
# nest_asyncio.apply() call made earlier in this cell, which lets
# asyncio.run() be used while an event loop is already running.
if __name__ == "__main__":
    asyncio.run(main())

## Step 3: Find Relevant Articles

In [None]:
# Import the necessary functions from agent.py
import sys
import os
import asyncio
import json
from dotenv import load_dotenv

# Add the current directory to the path so we can import agent.py
sys.path.append(os.getcwd())

# Import the rank_articles function from agent.py
from agent import rank_articles

# Load environment variables
load_dotenv()

# Define user information for article ranking
# Placeholder profile: replace the name, title and goals with your own.
user_info = dict(
    name="Dr. Jane Smith",
    title="Computer Science Professor",
    goals=(
        "I'm researching new approaches to natural language processing and "
        "looking for papers on transformer architectures and their applications."
    ),
)

# Function to run the article ranking
async def run_article_ranking(articles, top_n=5):
    """Rank *articles* for the module-level ``user_info`` and print the results.

    Args:
        articles: Article records passed through to ``rank_articles``.
        top_n: Forwarded to ``rank_articles`` as ``top_n``.

    Returns:
        The value returned by ``rank_articles``, or None when that value is
        empty or otherwise falsy.
    """
    print(f"Ranking {len(articles)} articles for relevance to user's interests...")

    # Delegate the scoring to agent.py
    ranked = await rank_articles(user_info, articles, top_n=top_n)
    if not ranked:
        print("Failed to rank articles")
        return None

    # Report: one numbered entry per ranked paper, separated by a rule
    heavy_rule = "=" * 80
    light_rule = "-" * 80
    print(f"\nTop {len(ranked)} Relevant Papers:")
    print(heavy_rule)
    for position, paper in enumerate(ranked, start=1):
        print(f"\n{position}. {paper.title}")
        print(f"   Authors: {', '.join(paper.authors)}")
        print(f"   Subject: {paper.subject}")
        print(f"   Relevance Score: {paper.relevance_score}/100")
        print(f"   Reasoning: {paper.reasoning}")
        print(f"   URL: {paper.abstract_url}")
        print(light_rule)

    return ranked

# Example usage: fetch the listing for one date, then rank it.
# The main() defined in the earlier cell keeps its results in a local
# variable, so the submissions are fetched again here rather than reused.
# Top-level await: presumably relies on the notebook kernel - TODO confirm.
target_date = "2025-04-01"
all_submissions = await get_arxiv_new_submissions(target_date, collect_all=True)
if all_submissions:
    ranked_articles = await run_article_ranking(all_submissions, top_n=5)
else:
    print("No submissions data was extracted.")

## Read and Summarize Every Article