In [1]:
from services import pdf_parser

In [2]:
# Paper to analyse. This is the arXiv abstract ("/abs/") page URL, not a direct
# PDF link — presumably pdf_parser.download_pdf resolves it to the PDF; verify.
arxiv_url = "https://arxiv.org/abs/2601.06953"

In [3]:
# download_pdf is awaited at the top level of the cell, so this relies on the
# notebook kernel supporting top-level `await`.
# NOTE(review): pdf_content is presumably the raw PDF payload — confirm in services.pdf_parser.
pdf_content = await pdf_parser.download_pdf(arxiv_url)

In [4]:
# Convert the downloaded PDF into Markdown text (called synchronously, no await).
# markdown_content is what gets interpolated into the analysis prompt further down.
markdown_content = pdf_parser.parse_pdf_to_markdown(pdf_content)

In [6]:
from openai import OpenAI


In [7]:
# Shared API client used by every request in this notebook. No credentials are
# passed explicitly — presumably the key comes from the environment
# (e.g. OPENAI_API_KEY); confirm it is set before running.
client = OpenAI()

In [8]:
from pydantic import BaseModel

In [9]:
# One entry of the structured answer: a pair of strings. Used as the element
# type of MathReasoning.steps.
# (Documented with comments rather than a docstring on purpose: a class
# docstring on a Pydantic model ends up in the generated JSON schema.)
class Step(BaseModel):
    explanation: str  # free-text reasoning for this step
    output: str  # the result produced by this step

# Top-level schema for the structured-output smoke test: it is passed as
# `text_format` to client.responses.parse, and the parsed reply is an instance
# of this class.
class MathReasoning(BaseModel):
    steps: list[Step]  # ordered list of Step entries
    final_answer: str  # the final answer as a string

In [10]:
# Smoke test for structured outputs: send a system + user message pair and ask
# the API to parse the reply straight into the MathReasoning schema.
response = client.responses.parse(
    model="gpt-4o-2024-08-06",
    input=[
        dict(
            role="system",
            content="You are a helpful math tutor. Guide the user through the solution step by step.",
        ),
        dict(role="user", content="how can I solve 8x + 7 = -23"),
    ],
    text_format=MathReasoning,
)

# output_parsed holds the reply already validated as a MathReasoning instance.
math_reasoning = response.output_parsed

In [11]:
# Display the parsed MathReasoning object (bare expression -> cell output).
math_reasoning

MathReasoning(steps=[Step(explanation='Start with the equation 8x + 7 = -23.', output='8x + 7 = -23'), Step(explanation='Subtract 7 from both sides of the equation to isolate the term with x on one side.', output='8x + 7 - 7 = -23 - 7'), Step(explanation='Simplify both sides.', output='8x = -30'), Step(explanation='Divide both sides by 8 to solve for x.', output='x = -30/8'), Step(explanation='Simplify the fraction -30/8 by dividing both the numerator and the denominator by their greatest common divisor, which is 2.', output='x = -15/4'), Step(explanation='Convert the answer into a decimal, if needed.', output='x = -3.75')], final_answer='x = -15/4 \\, \\text{or} \\, -3.75')

In [None]:
# User prompt for the paper-analysis request. It is an f-string: the instruction
# text below is fixed, and the full Markdown of the paper (markdown_content) is
# interpolated verbatim at the end, so the prompt length grows with the paper.
# The requested fields (paper title, summary sub-fields, benchmarks with
# name/score/metric) are the same ones read back from the parsed result when
# analysis_dict is built.
prompt = f"""You are an expert research paper analyst. Please analyze the following research paper and provide a structured analysis.

Extract the following information:

1. **Paper Title**: The exact title of the paper

2. **Summary**: A comprehensive summary organized into these sections:
   - main_contribution: What is the key innovation or finding?
   - methodology: What approach or methods were used?
   - key_results: What were the main findings or outcomes?
   - significance: Why is this work important?
   - limitations: Any notable limitations or future work mentioned?

3. **Benchmarks**: Extract ALL quantitative performance metrics and benchmark results mentioned in the paper. For each benchmark, provide:
   - name: The name of the benchmark/dataset (e.g., "ImageNet", "GLUE", "SQuAD", "COCO")
   - score: The numerical result achieved (e.g., "88.5%", "76.3", "SOTA")
   - metric: The evaluation metric used (e.g., "Accuracy", "F1-Score", "BLEU", "mAP", "Top-1 Accuracy")

Important: 
- Extract ALL benchmarks mentioned, including baseline comparisons
- If no benchmarks are mentioned, return an empty list
- Be precise with numerical values
- Include the metric units

Paper Content:
{markdown_content}
"""

In [14]:
from services.models import *

In [17]:
# Run the paper analysis: system role description plus the long user prompt,
# with the reply parsed directly into the PaperAnalysis schema.
response = client.responses.parse(
    model="gpt-5-mini",
    input=[
        dict(
            role="system",
            content="You are an expert research paper analyst who provides clear, structured summaries and extracts quantitative benchmarks from academic papers.",
        ),
        dict(role="user", content=prompt),
    ],
    text_format=PaperAnalysis,
)

# Parsed reply as a PaperAnalysis instance; later cells read its fields.
analysis = response.output_parsed

In [18]:
# Size check on the assembled prompt. This is a character count, not a token count.
len(prompt)

132153

In [20]:
# Spot-check one field of the parsed result.
analysis.summary.main_contribution

'The paper proposes a fully synthetic pipeline (SynthSmith) to generate competition-level programming tasks, verified solutions, and high-quality test cases, and demonstrates that Code LLMs trained solely on these synthetic SFT and RL datasets (the X-Coder series) can achieve state-of-the-art competitive-programming performance without relying on real-world coding data.'

In [None]:
# Flatten the parsed PaperAnalysis object into plain dicts/lists:
#   paper_title -> str field copied as-is
#   summary     -> one entry per summary field, in the fixed order below
#   benchmarks  -> one {"name", "score", "metric"} dict per benchmark entry
analysis_dict = {
    "paper_title": analysis.paper_title,
    "summary": {
        field: getattr(analysis.summary, field)
        for field in (
            "main_contribution",
            "methodology",
            "key_results",
            "significance",
            "limitations",
        )
    },
    "benchmarks": [
        {"name": bench.name, "score": bench.score, "metric": bench.metric}
        for bench in analysis.benchmarks
    ],
}

In [23]:
!pip install rich

Collecting rich
  Using cached rich-14.2.0-py3-none-any.whl.metadata (18 kB)
Collecting markdown-it-py>=2.2.0 (from rich)
  Using cached markdown_it_py-4.0.0-py3-none-any.whl.metadata (7.3 kB)
Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich)
  Using cached mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)
Using cached rich-14.2.0-py3-none-any.whl (243 kB)
Using cached markdown_it_py-4.0.0-py3-none-any.whl (87 kB)
Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)
Installing collected packages: mdurl, markdown-it-py, rich

   -------------------------- ------------- 2/3 [rich]
   -------------------------- ------------- 2/3 [rich]
   ---------------------------------------- 3/3 [rich]

Successfully installed markdown-it-py-4.0.0 mdurl-0.1.2 rich-14.2.0



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [24]:
# Plain-text dump of the whole flattened analysis (single-line dict repr).
print(analysis_dict)

{'paper_title': 'X-Coder: Advancing Competitive Programming With Fully Synthetic Tasks, Solutions, And Tests', 'summary': {'main_contribution': 'The paper proposes a fully synthetic pipeline (SynthSmith) to generate competition-level programming tasks, verified solutions, and high-quality test cases, and demonstrates that Code LLMs trained solely on these synthetic SFT and RL datasets (the X-Coder series) can achieve state-of-the-art competitive-programming performance without relying on real-world coding data.', 'methodology': '1) Data synthesis: SynthSmith extracts and evolves competition-oriented features from existing code snippets, composes them via a two-stage feature-selection → task-formulation process, and supports multiple task styles (Codeforces / LeetCode / AtCoder). 2) Solution & test generation: multiple candidate solutions are sampled from strong teacher LLMs; test inputs are generated via prompting and a tool-based library (CYaRon); both solutions and tests are cross-va