In [1]:
import requests
import time
import arxiv
import os
from tqdm import tqdm
import pandas as pd

def get_arxiv_pdf(arxiv_id):
    try:
        search = arxiv.Search(id_list=[arxiv_id])
        paper = next(search.results())
        return paper.pdf_url
    except Exception as e:
        print(f"\nError getting arXiv PDF URL for {arxiv_id}: {e}")
        return None

def retrieve_url(url, filepath):
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except Exception as e:
        print(f"\nError downloading {url}: {e}")
        return False

def get_all_citations(paper_id):
    citations = []
    offset = 0
    limit = 1000
    total_citations = 0
    
    url = f"http://api.semanticscholar.org/graph/v1/paper/{paper_id}"
    params = {"fields": "citationCount"}
    try:
        response = requests.get(url, params=params)
        if response.status_code == 200:
            total_citations = response.json()['citationCount']
            print(f"Total citations to fetch: {total_citations}")
        else:
            print(f"Failed to get citation count: {response.status_code}")
            return citations
    except Exception as e:
        print(f"Error getting citation count: {e}")
        return citations
    
    with tqdm(total=total_citations, desc="Fetching citations") as pbar:
        while True:
            url = f"http://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations"
            params = {
                "fields": "citingPaper.title,citingPaper.externalIds,citingPaper.url",
                "limit": limit,
                "offset": offset
            }
            
            try:
                response = requests.get(url, params=params)
                if response.status_code == 200:
                    data = response.json()
                    batch = data.get('data', [])
                    if not batch:
                        break
                    
                    citations.extend(batch)
                    pbar.update(len(batch))
                    
                    offset += limit
                    time.sleep(1)
                else:
                    print(f"Error fetching citations: {response.status_code}")
                    break
            except Exception as e:
                print(f"Error in citation request: {e}")
                break
                
    return citations

paperId = "1b6e810ce0afd0dd093f789d2b2742d047e316d5"
citations = get_all_citations(paperId)

arxiv_citations = []
for citation in citations:
    if 'citingPaper' in citation:
        paper = citation['citingPaper']
        external_ids = paper.get('externalIds', {})
        if any(key.lower() == 'arxiv' for key in external_ids.keys()):
            arxiv_citations.append(paper)

print(f"\nTotal citations with arXiv IDs: {len(arxiv_citations)}")
print("\nFirst 5 papers to be downloaded:")
for i, paper in enumerate(arxiv_citations[:5]):
    print(f"{i+1}. {paper.get('title')}")

print("\nStarting downloads...")
if not os.path.exists("pdfs"):
    os.makedirs("pdfs")

for paper in tqdm(arxiv_citations, desc="Processing arXiv papers"):
    title = paper['title']
    arxiv_key = next(key for key in paper['externalIds'].keys() if key.lower() == 'arxiv')
    arxiv_id = paper['externalIds'][arxiv_key]
    
    safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
    pdf_path = f"pdfs/{safe_title}.pdf"
    
    if os.path.exists(pdf_path):
        continue
        
    pdf_url = get_arxiv_pdf(arxiv_id)
    if pdf_url:
        if retrieve_url(pdf_url, pdf_path):
            continue  
        else:
            print(f"\nFailed to download: {title}")
    
    time.sleep(3)

Total citations to fetch: 6318


Fetching citations: 100%|██████████████████| 6318/6318 [00:25<00:00, 252.40it/s]



Total citations with arXiv IDs: 4895

First 5 papers to be downloaded:
1. Enhancing Table Recognition with Vision LLMs: A Benchmark and Neighbor-Guided Toolchain Reasoner
2. Efficiently Serving LLM Reasoning Programs with Certaindex
3. Do NOT Think That Much for 2+3=? On the Overthinking of o1-Like LLMs
4. Pushing the Envelope of Low-Bit LLM via Dynamic Error Compensation
5. Toward Adaptive Reasoning in Large Language Models with Thought Rollback

Starting downloads...


  paper = next(search.results())
Processing arXiv papers:  10%|█▌             | 502/4895 [05:29<41:40,  1.76it/s]


Error downloading http://arxiv.org/pdf/2410.07062v2: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2410.07062v2

Failed to download: TinyEmo: Scaling down Emotional Reasoning via Metric Projection


Processing arXiv papers:  37%|█████         | 1789/4895 [18:54<27:51,  1.86it/s]


Error downloading http://arxiv.org/pdf/2405.02659v2: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2405.02659v2

Failed to download: R4: Reinforced Retriever-Reorder-Responder for Retrieval-Augmented Large Language Models


Processing arXiv papers:  57%|███████▉      | 2777/4895 [31:35<34:25,  1.03it/s]


Error downloading http://arxiv.org/pdf/2312.08926v2: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2312.08926v2

Failed to download: Modeling Complex Mathematical Reasoning via Large Language Model based MathAgent


Processing arXiv papers:  63%|████████▊     | 3098/4895 [38:37<15:57,  1.88it/s]


Error downloading http://arxiv.org/pdf/2310.18331v2: 404 Client Error: NOT FOUND for url: http://arxiv.org/pdf/2310.18331v2

Failed to download: AllTogether: Investigating the Efficacy of Spliced Prompt for Web Navigation using Large Language Models


Processing arXiv papers:  64%|█████████     | 3148/4895 [39:23<22:01,  1.32it/s]


Error downloading http://arxiv.org/pdf/2310.10698v2: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2310.10698v2

Failed to download: Bridging Code Semantic and LLMs: Semantic Chain-of-Thought Prompting for Code Generation


Processing arXiv papers:  70%|█████████▊    | 3438/4895 [44:14<24:42,  1.02s/it]


Error downloading http://arxiv.org/pdf/2309.12481v2: 404 Client Error: NOT FOUND for url: http://arxiv.org/pdf/2309.12481v2

Failed to download: HANS, are you clever? Clever Hans Effect Analysis of Neural Systems


Processing arXiv papers:  71%|████████▌   | 3470/4895 [45:19<2:06:37,  5.33s/it]


Error downloading http://arxiv.org/pdf/2309.09749v3: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2309.09749v3

Failed to download: Facilitating NSFW Text Detection in Open-Domain Dialogue Systems via Knowledge Distillation


Processing arXiv papers:  80%|███████████▏  | 3922/4895 [54:26<14:02,  1.15it/s]


Error downloading http://arxiv.org/pdf/2306.08997v2: 404 Client Error: NOT FOUND for url: http://arxiv.org/pdf/2306.08997v2

Failed to download: Exploring the MIT Mathematics and EECS Curriculum Using Large Language Models


Processing arXiv papers:  95%|███████████▍| 4667/4895 [1:10:46<05:45,  1.52s/it]


Error downloading http://arxiv.org/pdf/2306.07622v2: 404 Client Error: Not Found for url: http://arxiv.org/pdf/2306.07622v2

Failed to download: Human-like intuitive behavior and reasoning biases emerged in large language models but disappeared in ChatGPT


Processing arXiv papers: 100%|████████████| 4895/4895 [1:14:17<00:00,  1.10it/s]


In [2]:
from pathlib import Path
from PyPDF2 import PdfReader
import json
from tqdm import tqdm
import warnings
import logging

logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.ERROR)

warnings.filterwarnings('ignore', message='.*FloatObject.*invalid.*')
warnings.filterwarnings('ignore', message='.*unknown widths.*')
warnings.filterwarnings('ignore', category=Warning, module='PyPDF2')

def clean_text(text):
    """Clean text of invalid Unicode characters"""
    if not text:
        return ""
    # Replace surrogate characters with a replacement character
    return text.encode('utf-16', 'surrogatepass').decode('utf-16', 'replace')

directory = "pdfs"
pdf_files = list(Path(directory).glob('*.pdf'))
pdf_text_dict = {}

# Create progress bar
for file in tqdm(pdf_files, desc="Processing PDFs"):
    try:
        reader = PdfReader(file)
        meta = reader.metadata
        
        # Initialize dictionary entry with metadata, cleaning each field
        pdf_text_dict[file.stem] = {
            "filename": file.name,
            "author": clean_text(meta.author) if meta.author else "",
            "creator": clean_text(meta.creator) if meta.creator else "",
            "subject": clean_text(meta.subject) if meta.subject else "",
            "title": clean_text(meta.title) if meta.title else "",
            "text": ""
        }
        
        # Extract text from all pages
        text_content = []
        for page in reader.pages:
            try:
                extracted_text = page.extract_text()
                if extracted_text:
                    text_content.append(clean_text(extracted_text))
            except Exception as page_error:
                print(f"\nError extracting text from page in {file}: {str(page_error)}")
                continue
            
        # Join all pages' text with newlines
        pdf_text_dict[file.stem]["text"] = "\n".join(text_content)
        
    except Exception as e:
        print(f"\nError processing {file}: {str(e)}")
        pdf_text_dict[file.stem] = {
            "filename": file.name,
            "error": str(e),
            "text": ""
        }

# Save to JSON file
output_path = "pdf_contents.json"
try:
    # First try with standard encoding
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(pdf_text_dict, f, ensure_ascii=False, indent=4)
    except UnicodeEncodeError:
        # If that fails, fall back to ensuring ASCII with escaping
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(pdf_text_dict, f, ensure_ascii=True, indent=4)
    print(f"\nSuccessfully saved data to {output_path}")
except Exception as e:
    print(f"\nError saving JSON file: {str(e)}")
    # Last resort: try to save with problematic characters removed
    try:
        cleaned_dict = {k: {
            key: str(value).encode('ascii', 'ignore').decode('ascii') 
            for key, value in v.items()
        } for k, v in pdf_text_dict.items()}
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(cleaned_dict, f, ensure_ascii=True, indent=4)
        print(f"Saved file with ASCII-only characters to {output_path}")
    except Exception as final_e:
        print(f"Final attempt to save failed: {str(final_e)}")

Processing PDFs: 100%|████████████████████| 4886/4886 [1:00:14<00:00,  1.35it/s]



Successfully saved data to pdf_contents.json


In [5]:
import json
import openai
from pathlib import Path
from tqdm import tqdm
import time
from openai import OpenAI
import tiktoken
from datetime import datetime

# Add your OpenAI API token here

def count_tokens(text):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    return len(encoding.encode(text))

def truncate_text(text, max_tokens=14000):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        text = encoding.decode(tokens)
    return text

def create_prompt(paper_text):
    return f"""Analyze this academic paper and extract all benchmarks/datasets and models mentioned in experiments, evaluations, or comparisons.

Rules for extraction:
1. For benchmarks/datasets:
   - Include standard evaluation datasets (e.g., MNIST, ImageNet, SQuAD)
   - Include custom datasets if they're used for evaluation
   - Do NOT include training datasets unless they're also used for evaluation

2. For models:
   - Include baseline models used for comparison
   - Include proposed/novel models being evaluated
   - Include model variants tested in ablation studies
   - Do NOT include referenced models that weren't actually tested

Format your response as a JSON object with this exact structure:
{{
    "benchmarks": ["benchmark1", "benchmark2"],
    "models": ["model1", "model2"]
}}

Paper text:
{paper_text}"""

def analyze_paper_with_gpt(paper_text):
    max_retries = 3
    for attempt in range(max_retries):
        try:
            truncated_text = truncate_text(paper_text)
            was_truncated = len(truncated_text) < len(paper_text)
            
            prompt = create_prompt(truncated_text)
            token_count = count_tokens(prompt)
            
            if token_count > 15000:
                print("Warning: Token count exceeds safe limit")
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": "Token limit exceeded"
                }
                
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that analyzes academic papers and returns responses in JSON format only."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=1000,
                response_format={"type": "json_object"}
            )

            result = response.choices[0].message.content
            
            # Add error checking for response content
            if not result.strip().startswith('{') or not result.strip().endswith('}'):
                raise json.JSONDecodeError("Invalid JSON format", result, 0)
                
            parsed_result = json.loads(result)
            
            if was_truncated:
                parsed_result["note"] = "Analysis based on truncated paper text"
                
            return parsed_result

        except json.JSONDecodeError as e:
            print(f"\nAttempt {attempt + 1}/{max_retries} - JSON parsing error: {str(e)}")
            print(f"Raw response: {result}")
            if attempt == max_retries - 1:
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": f"Failed to parse GPT response as JSON after {max_retries} attempts: {str(e)}"
                }
            time.sleep(2)  # Wait before retry
            
        except Exception as e:
            print(f"\nAttempt {attempt + 1}/{max_retries} - Error: {str(e)}")
            if attempt == max_retries - 1:
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": str(e)
                }
            time.sleep(2)  # Wait before retry

def save_progress(analysis_results, base_filename, iteration):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{base_filename}_iter{iteration}_{timestamp}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(analysis_results, f, ensure_ascii=False, indent=4)
    print(f"\nProgress saved to {filename}")

def main():
    input_file = "pdf_contents.json"
    output_base = "paper_analysis"

    with open(input_file, 'r', encoding='utf-8') as f:
        pdf_contents = json.load(f)

    analysis_results = {}
    
    # Convert items to list for tqdm
    items = list(pdf_contents.items())
    save_interval = 500  # Save every 500 papers
    
    for i, (paper_id, paper_data) in enumerate(tqdm(items[3000:], desc="Analyzing papers")):
        paper_text = paper_data.get('text', '')
        
        if not paper_text:
            print(f"\nSkipping {paper_id} - no text content")
            continue

        try:
            result = analyze_paper_with_gpt(paper_text)
            analysis_results[paper_id] = {
                "filename": paper_data['filename'],
                "analysis": result
            }
            
            # Save progress every save_interval papers
            if (i + 1) % save_interval == 0:
                save_progress(analysis_results, output_base, f"checkpoint_{i+1}")
                
            time.sleep(2)

        except Exception as e:
            print(f"\nError analyzing {paper_id}: {str(e)}")
            analysis_results[paper_id] = {
                "filename": paper_data['filename'],
                "error": str(e)
            }

    # Save final results
    save_progress(analysis_results, output_base, "final")

if __name__ == "__main__":
    main()

Analyzing papers:   3%|▎         | 48/1886 [04:20<2:44:35,  5.37s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:   4%|▍         | 72/1886 [07:24<3:53:16,  7.72s/it]


Attempt 1/3 - JSON parsing error: Invalid JSON format: line 1 column 1 (char 0)
Raw response: {
    "benchmarks": [
        "AQA-Bench",
        "GuessNum",
        "DFS",
        "BFS",
        "Coin",
        "CaveDFS",
        "CaveBFS"
    ],
    "models": [
        "GPT-4",
        "Gemini",
        "GPT-3.5-Turbo",
        "GPT-4-Turbo",
        "Gemini-Pro",
        "


Analyzing papers:   9%|▊         | 161/1886 [20:59<2:14:33,  4.68s/it]


Attempt 1/3 - JSON parsing error: Invalid JSON format: line 1 column 1 (char 0)
Raw response: {
    "benchmarks": [
        "ACE04",
        "ACE05",
        "CoNLL03",
        "OntoNotes 5.0",
        "GENIA",
        "NYT",
        "ADE",
        "CoNLL04",
        "SciERC",
        "TACRED",
        "Re-TACRED",
        "TACREV",
        "SemEval"
    ],
    "models": [
        "GPT-4",
        "ChatGPT",
        "LLaMA",
        "Flan-T5",
        "CodeX",
        "GPT-NER",
        "Cp-NER",
        "LLMaAA",
        "PromptNER",
        "UniNER",
        "NAG-NER",
        "GNER",
        "NuNER",
        "MetaNER",
        "LinkNER",
        "SLCoLM",
        "ProgGen",
        "C-ICL",
        "VerifiNER",
        "ConsistNER",
        "GLiNER",
        "LTNER",
        "ToNER",
        "RT",
        "VANER",
        "RiVEG",
        "LLM-DA",
        "REBEL",
        "QA4RE",
        "GPT-RE",
        "STAR",
        "AugURE",
        "REPAL",
        "RAG4RE",
        "BART-

Analyzing papers:   9%|▉         | 176/1886 [34:42<5:37:17, 11.83s/it]


KeyboardInterrupt: 

In [11]:
import json
import glob

json_files = glob.glob('paper_analysis_itercheckpoint_*.json')
combined_data = {}

for file in json_files:
    with open(file) as f:
        data = json.load(f)
        combined_data.update(data)

with open('combined_analysis.json', 'w') as f:
    json.dump(combined_data, f, indent=2)


In [1]:
import pandas as pd

excel_file = "ensemble_works-new.xlsx"
sheet_names = pd.read_excel(excel_file, sheet_name=None).keys()
print(sheet_names)

dict_keys(['Properties', 'Models Used', 'Models Used Recently', 'Models Used Paste', 'Benchmarks Used', 'Benchmarks by Use'])


In [9]:
model_df = pd.read_excel(excel_file, sheet_name='Models Used')
model_df = model_df.iloc[:109, :-2]
# model_df.head()
# model_df.tail()

In [10]:
benchmark_df = pd.read_excel(excel_file, sheet_name='Benchmarks Used')
benchmark_df = benchmarks_df.iloc[:109, :-5]

In [12]:
annoted_papers = {}

for (index_one, model_row), (index_two, benchmark_row) in zip(model_df.iterrows(), benchmark_df.iterrows()):
    paper_title = model_row['Paper Titles']
    annoted_papers[paper_title] = {
        "filename": f"{paper_title}.pdf",
        "analysis": {
            "models": [
                model for model, value in model_row[2:].items()
                if value == 1.0
            ],
            "benchmarks": [
                benchmark for benchmark, value in benchmark_row[2:].items()
                if value == 1.0
            ],
            "note": "Analysis based on human review."
        }
    }

with open('annotated_papers.json', 'w', encoding='utf-8') as file:
    json.dump(annoted_papers, file, indent=2)

In [25]:
import json

def clean_title(title):
    # Remove common punctuation and convert to lowercase
    return title.lower().replace(":", "").replace("-", "").replace("  ", " ").strip()

with open('annotated_papers.json', 'r') as file1, open('pdf_contents.json', 'r') as file2:
    data1 = json.load(file1)
    data2 = json.load(file2)
    
    matches = 0
    
    # For each paper in first file
    for title1, paper1 in data1.items():
        title1_clean = clean_title(title1)
        
        # Check if this title exists in any title from second file
        for title2 in data2.keys():
            title2_clean = clean_title(title2)
            
            # Check if titles are effectively the same
            if title1_clean == title2_clean or \
               title1_clean in title2_clean and len(title1_clean) > 20:  # Length check to avoid short title false matches
                matches += 1
                print(f"\nMatch found:")
                print(f"File 1: {title1}")
                print(f"File 2: {title2}")
                break

    print(f"\nTotal papers in first file: {len(data1)}")
    print(f"Total papers in second file: {len(data2)}")
    print(f"Number of matching papers: {matches}")


Match found:
File 1: ReAct: Synergizing Reasoning and Acting in Language Models
File 2: ReAct Synergizing Reasoning and Acting in Language Models

Match found:
File 1: Tree of Thoughts: Deliberate Problem Solving with Large Language Models
File 2: Tree of Thoughts Deliberate Problem Solving with Large Language Models

Match found:
File 1: Boosted Prompt Ensembles for Large Language Models
File 2: Boosted Prompt Ensembles for Large Language Models

Match found:
File 1: Ask Me Anything: A simple strategy for prompting language models
File 2: Ask Me Anything A simple strategy for prompting language models

Match found:
File 1: Graph of Thoughts: Solving Elaborate Problems with Large Language Models
File 2: Graph of Thoughts Solving Elaborate Problems with Large Language Models

Match found:
File 1: More Agents Is All You Need
File 2: More Agents Is All You Need

Match found:
File 1: ReConcile: Round-Table Conference Improves Reasoning via Consensus among Diverse LLMs
File 2: ReConcile Ro

In [39]:
import json
import openai
from pathlib import Path
from tqdm import tqdm
import time
from openai import OpenAI
import tiktoken
from datetime import datetime

# Replace with your API key

def count_tokens(text):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    return len(encoding.encode(text))

def truncate_text(text, max_tokens=14000):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        text = encoding.decode(tokens)
    return text

def create_prompt(paper_text):
    return f"""Analyze this academic paper and extract two specific types of information:

1. Benchmarks/Datasets used for evaluation:
   - Include standard evaluation datasets (e.g., MNIST, ImageNet, SQuAD)
   - Include custom datasets if they're used for evaluation
   - Do NOT include training datasets unless they're also used for evaluation

2. Base Language Models used in experiments:
   - Include specific model architectures and variants (e.g., GPT-4, LLaMA-70B, PaLM-540B), which includes parameter size
   - Do NOT include methods or techniques (e.g., don't include Chain-of-Thought, Self-Consistency, etc.)
   - For custom models, specify the base model they use (e.g., if a paper introduces "CustomBERT", note it uses BERT as base)

Format your response as a JSON object with this exact structure:
{{
    "benchmarks": ["benchmark1", "benchmark2"],
    "base_models": ["model1 (with size if specified)", "model2 (with size if specified)"]
}}

Paper text:
{paper_text}"""

def analyze_paper_with_gpt(paper_text):
    max_retries = 3
    for attempt in range(max_retries):
        try:
            truncated_text = truncate_text(paper_text)
            was_truncated = len(truncated_text) < len(paper_text)
            
            prompt = create_prompt(truncated_text)
            token_count = count_tokens(prompt)
            
            if token_count > 15000:
                print("Warning: Token count exceeds safe limit")
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": "Token limit exceeded"
                }
                
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful graduate research assistant that analyzes academic papers and returns responses in JSON format only."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=1000,
                response_format={"type": "json_object"}
            )

            result = response.choices[0].message.content
            
            # Add error checking for response content
            if not result.strip().startswith('{') or not result.strip().endswith('}'):
                raise json.JSONDecodeError("Invalid JSON format", result, 0)
                
            parsed_result = json.loads(result)
            
            if was_truncated:
                parsed_result["note"] = "Analysis based on truncated paper text"
                
            return parsed_result

        except json.JSONDecodeError as e:
            print(f"\nAttempt {attempt + 1}/{max_retries} - JSON parsing error: {str(e)}")
            print(f"Raw response: {result}")
            if attempt == max_retries - 1:
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": f"Failed to parse GPT response as JSON after {max_retries} attempts: {str(e)}"
                }
            time.sleep(2)  # Wait before retry
            
        except Exception as e:
            print(f"\nAttempt {attempt + 1}/{max_retries} - Error: {str(e)}")
            if attempt == max_retries - 1:
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": str(e)
                }
            time.sleep(2)  # Wait before retry

def save_progress(analysis_results, base_filename, iteration):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{base_filename}_iter{iteration}_{timestamp}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(analysis_results, f, ensure_ascii=False, indent=4)
    print(f"\nProgress saved to {filename}")

def clean_title(title):
    return title.lower().replace(":", "").replace("-", "").replace("  ", " ").strip()

def main():
    # First, load and find matching papers
    with open('annotated_papers.json', 'r') as file1, open('pdf_contents.json', 'r') as file2:
        data1 = json.load(file1)
        pdf_contents = json.load(file2)
        
        matched_papers = {}
        
        # Find matching papers
        for title1, paper1 in data1.items():
            title1_clean = clean_title(title1)
            
            for paper_id, paper_data in pdf_contents.items():
                title2 = paper_data['filename']
                title2_clean = clean_title(title2)
                
                if title1_clean == title2_clean or \
                   (title1_clean in title2_clean and len(title1_clean) > 20):
                    matched_papers[paper_id] = paper_data
                    print(f"Match found: {title1}")
                    break

    print(f"\nFound {len(matched_papers)} matching papers to analyze")
    
    # Now process only the matched papers
    analysis_results = {}
    save_interval = 10  # Reduced save interval since we have fewer papers
    
    # Convert items to list for tqdm
    items = list(matched_papers.items())
    
    for i, (paper_id, paper_data) in enumerate(tqdm(items, desc="Analyzing matched papers")):
        paper_text = paper_data.get('text', '')
        
        if not paper_text:
            print(f"\nSkipping {paper_id} - no text content")
            continue

        try:
            result = analyze_paper_with_gpt(paper_text)
            analysis_results[paper_id] = {
                "filename": paper_data['filename'],
                "analysis": result
            }
            
            # Save progress more frequently since we're processing fewer papers
            if (i + 1) % save_interval == 0:
                save_progress(analysis_results, "matched_papers_analysis", f"checkpoint_{i+1}")
                
            time.sleep(2)  # Rate limiting

        except Exception as e:
            print(f"\nError analyzing {paper_id}: {str(e)}")
            analysis_results[paper_id] = {
                "filename": paper_data['filename'],
                "error": str(e)
            }

    # Save final results
    save_progress(analysis_results, "matched_papers_analysis", "final")

if __name__ == "__main__":
    main()

Match found: ReAct: Synergizing Reasoning and Acting in Language Models
Match found: Tree of Thoughts: Deliberate Problem Solving with Large Language Models
Match found: Boosted Prompt Ensembles for Large Language Models
Match found: Ask Me Anything: A simple strategy for prompting language models
Match found: Graph of Thoughts: Solving Elaborate Problems with Large Language Models
Match found: More Agents Is All You Need
Match found: ReConcile: Round-Table Conference Improves Reasoning via Consensus among Diverse LLMs
Match found: Boosting of Thoughts: Trial-and-Error Problem Solving with Large Language Models
Match found: Fill in the Blank: Exploring and Enhancing LLM Capabilities for Backward Reasoning in Math Word Problems
Match found: InferFix: End-to-End Program Repair with LLMs
Match found: Revisit Input Perturbation Problems for LLMs: A Unified Robustness Evaluation Framework for Noisy Slot Filling Task
Match found: FreshLLMs: Refreshing Large Language Models with Search Engine

Analyzing matched papers:  18%|███▏              | 9/50 [01:05<07:29, 10.97s/it]


Progress saved to matched_papers_analysis_itercheckpoint_10_20250103_132347.json


Analyzing matched papers:  38%|██████▍          | 19/50 [01:47<02:18,  4.46s/it]


Progress saved to matched_papers_analysis_itercheckpoint_20_20250103_132430.json


Analyzing matched papers:  58%|█████████▊       | 29/50 [02:32<01:31,  4.36s/it]


Progress saved to matched_papers_analysis_itercheckpoint_30_20250103_132516.json


Analyzing matched papers:  78%|█████████████▎   | 39/50 [03:16<00:47,  4.32s/it]


Progress saved to matched_papers_analysis_itercheckpoint_40_20250103_132559.json


Analyzing matched papers:  98%|████████████████▋| 49/50 [04:13<00:05,  5.51s/it]


Progress saved to matched_papers_analysis_itercheckpoint_50_20250103_132658.json


Analyzing matched papers: 100%|█████████████████| 50/50 [04:20<00:00,  5.21s/it]


Progress saved to matched_papers_analysis_iterfinal_20250103_132700.json





In [4]:
import json
import openai
from pathlib import Path
from tqdm import tqdm
import time
from openai import OpenAI
import tiktoken
from datetime import datetime
import logging

# Disable OpenAI logging
logging.getLogger("openai").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Replace with your API key

def count_tokens(text):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    return len(encoding.encode(text))

def truncate_text(text, max_tokens=14000):
    encoding = tiktoken.encoding_for_model("gpt-4o")
    tokens = encoding.encode(text)
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
        text = encoding.decode(tokens)
    return text

def create_prompt(paper_text):
    return f"""Analyze this academic paper and extract two specific types of information:

1. Benchmarks/Datasets used for evaluation:
   - Include standard evaluation datasets (e.g., MNIST, ImageNet, SQuAD)
   - Include custom datasets if they're used for evaluation
   - Do NOT include training datasets unless they're also used for evaluation

2. Base Language Models used in experiments:
   - Include specific model architectures and variants (e.g., GPT-4, LLaMA-70B, PaLM-540B), which includes parameter size
   - Do NOT include methods or techniques (e.g., don't include Chain-of-Thought, Self-Consistency, etc.)
   - For custom models, specify the base model they use (e.g., if a paper introduces "CustomBERT", note it uses BERT as base)

Format your response as a JSON object with this exact structure:
{{
    "benchmarks": ["benchmark1", "benchmark2"],
    "base_models": ["model1 (with size if specified)", "model2 (with size if specified)"]
}}

Paper text:
{paper_text}"""

def analyze_paper_with_gpt(paper_text):
    max_retries = 3
    for attempt in range(max_retries):
        try:
            truncated_text = truncate_text(paper_text)
            was_truncated = len(truncated_text) < len(paper_text)
            
            prompt = create_prompt(truncated_text)
            token_count = count_tokens(prompt)
            
            if token_count > 15000:
                print("Warning: Token count exceeds safe limit")
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": "Token limit exceeded"
                }
                
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful graduate research assistant that analyzes academic papers and returns responses in JSON format only."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=1000,
                response_format={"type": "json_object"}
            )

            result = response.choices[0].message.content
            
            # Add error checking for response content
            if not result.strip().startswith('{') or not result.strip().endswith('}'):
                raise json.JSONDecodeError("Invalid JSON format", result, 0)
                
            parsed_result = json.loads(result)
            
            if was_truncated:
                parsed_result["note"] = "Analysis based on truncated paper text"
                
            return parsed_result

        except json.JSONDecodeError as e:
            print(f"\nAttempt {attempt + 1}/{max_retries} - JSON parsing error: {str(e)}")
            print(f"Raw response: {result}")
            if attempt == max_retries - 1:
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": f"Failed to parse GPT response as JSON after {max_retries} attempts: {str(e)}"
                }
            time.sleep(2)  # Wait before retry
            
        except Exception as e:
            print(f"\nAttempt {attempt + 1}/{max_retries} - Error: {str(e)}")
            if attempt == max_retries - 1:
                return {
                    "benchmarks": [],
                    "models": [],
                    "error": str(e)
                }
            time.sleep(2)  # Wait before retry

def save_progress(analysis_results, base_filename, iteration):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{base_filename}_iter{iteration}_{timestamp}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(analysis_results, f, ensure_ascii=False, indent=4)
    print(f"\nProgress saved to {filename}")

def main():
    # Load all papers from pdf_contents
    with open('pdf_contents.json', 'r') as file:
        pdf_contents = json.load(file)

    print(f"\nFound {len(pdf_contents)} papers to analyze")
    
    # Initialize results dictionary
    analysis_results = {}
    save_interval = 20  # Save every 20 papers
    
    # Convert items to list for tqdm
    items = list(pdf_contents.items())
    
    for i, (paper_id, paper_data) in enumerate(tqdm(items, desc="Analyzing papers")):
        paper_text = paper_data.get('text', '')
        
        if not paper_text:
            print(f"\nSkipping {paper_id} - no text content")
            continue

        try:
            result = analyze_paper_with_gpt(paper_text)
            analysis_results[paper_id] = {
                "filename": paper_data['filename'],
                "analysis": result
            }
            
            # Save progress periodically
            if (i + 1) % save_interval == 0:
                save_progress(analysis_results, "papers_analysis", f"checkpoint_{i+1}")
                
            time.sleep(2)  # Rate limiting

        except Exception as e:
            print(f"\nError analyzing {paper_id}: {str(e)}")
            analysis_results[paper_id] = {
                "filename": paper_data['filename'],
                "error": str(e)
            }

    # Save final results
    save_progress(analysis_results, "papers_analysis", "final")

if __name__ == "__main__":
    main()


Found 4886 papers to analyze


Analyzing papers:   0%|                     | 19/4886 [01:22<6:03:38,  4.48s/it]


Progress saved to papers_analysis_itercheckpoint_20_20250107_140550.json


Analyzing papers:   1%|                     | 25/4886 [01:55<6:22:50,  4.73s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:   1%|▏                    | 39/4886 [03:06<7:45:07,  5.76s/it]


Progress saved to papers_analysis_itercheckpoint_40_20250107_140727.json


Analyzing papers:   1%|▎                    | 59/4886 [04:42<6:00:59,  4.49s/it]


Progress saved to papers_analysis_itercheckpoint_60_20250107_140904.json


Analyzing papers:   2%|▎                    | 79/4886 [06:48<9:12:03,  6.89s/it]


Progress saved to papers_analysis_itercheckpoint_80_20250107_141110.json


Analyzing papers:   2%|▍                    | 99/4886 [08:16<5:38:43,  4.25s/it]


Progress saved to papers_analysis_itercheckpoint_100_20250107_141237.json


Analyzing papers:   2%|▍                   | 119/4886 [10:10<5:45:55,  4.35s/it]


Progress saved to papers_analysis_itercheckpoint_120_20250107_141432.json


Analyzing papers:   3%|▌                   | 139/4886 [11:40<5:31:01,  4.18s/it]


Progress saved to papers_analysis_itercheckpoint_140_20250107_141602.json


Analyzing papers:   3%|▋                   | 159/4886 [13:22<5:53:46,  4.49s/it]


Progress saved to papers_analysis_itercheckpoint_160_20250107_141745.json


Analyzing papers:   4%|▋                   | 179/4886 [15:27<8:34:39,  6.56s/it]


Progress saved to papers_analysis_itercheckpoint_180_20250107_141949.json


Analyzing papers:   4%|▊                   | 199/4886 [17:10<6:00:34,  4.62s/it]


Progress saved to papers_analysis_itercheckpoint_200_20250107_142133.json


Analyzing papers:   4%|▉                   | 219/4886 [19:05<8:33:10,  6.60s/it]


Progress saved to papers_analysis_itercheckpoint_220_20250107_142327.json


Analyzing papers:   5%|▉                   | 239/4886 [20:40<6:26:20,  4.99s/it]


Progress saved to papers_analysis_itercheckpoint_240_20250107_142502.json


Analyzing papers:   5%|█                   | 259/4886 [22:21<6:42:11,  5.22s/it]


Progress saved to papers_analysis_itercheckpoint_260_20250107_142643.json


Analyzing papers:   6%|█▏                  | 279/4886 [23:54<7:07:05,  5.56s/it]


Progress saved to papers_analysis_itercheckpoint_280_20250107_142817.json


Analyzing papers:   6%|█▏                  | 299/4886 [25:30<7:27:45,  5.86s/it]


Progress saved to papers_analysis_itercheckpoint_300_20250107_142952.json


Analyzing papers:   7%|█▎                  | 319/4886 [27:11<6:49:04,  5.37s/it]


Progress saved to papers_analysis_itercheckpoint_320_20250107_143133.json


Analyzing papers:   7%|█▍                  | 339/4886 [29:11<6:34:46,  5.21s/it]


Progress saved to papers_analysis_itercheckpoint_340_20250107_143336.json


Analyzing papers:   7%|█▍                  | 359/4886 [30:48<5:15:36,  4.18s/it]


Progress saved to papers_analysis_itercheckpoint_360_20250107_143509.json


Analyzing papers:   8%|█▌                  | 379/4886 [32:25<6:58:32,  5.57s/it]


Progress saved to papers_analysis_itercheckpoint_380_20250107_143647.json


Analyzing papers:   8%|█▋                  | 399/4886 [33:47<5:31:18,  4.43s/it]


Progress saved to papers_analysis_itercheckpoint_400_20250107_143810.json


Analyzing papers:   9%|█▋                  | 419/4886 [35:25<5:47:17,  4.66s/it]


Progress saved to papers_analysis_itercheckpoint_420_20250107_143948.json


Analyzing papers:   9%|█▊                  | 439/4886 [37:17<9:08:40,  7.40s/it]


Progress saved to papers_analysis_itercheckpoint_440_20250107_144139.json


Analyzing papers:   9%|█▉                  | 459/4886 [38:45<5:21:46,  4.36s/it]


Progress saved to papers_analysis_itercheckpoint_460_20250107_144309.json


Analyzing papers:  10%|█▉                  | 479/4886 [40:41<7:09:51,  5.85s/it]


Progress saved to papers_analysis_itercheckpoint_480_20250107_144503.json


Analyzing papers:  10%|██                  | 499/4886 [42:25<6:53:40,  5.66s/it]


Progress saved to papers_analysis_itercheckpoint_500_20250107_144648.json


Analyzing papers:  11%|██                  | 519/4886 [43:59<5:28:40,  4.52s/it]


Progress saved to papers_analysis_itercheckpoint_520_20250107_144821.json


Analyzing papers:  11%|██▏                 | 539/4886 [45:29<5:08:45,  4.26s/it]


Progress saved to papers_analysis_itercheckpoint_540_20250107_144951.json


Analyzing papers:  11%|██▎                 | 559/4886 [47:04<5:54:32,  4.92s/it]


Progress saved to papers_analysis_itercheckpoint_560_20250107_145126.json


Analyzing papers:  12%|██▎                 | 579/4886 [48:49<4:49:02,  4.03s/it]


Progress saved to papers_analysis_itercheckpoint_580_20250107_145314.json


Analyzing papers:  12%|██▍                 | 599/4886 [50:25<4:43:43,  3.97s/it]


Progress saved to papers_analysis_itercheckpoint_600_20250107_145447.json


Analyzing papers:  13%|██▌                 | 619/4886 [52:07<6:38:30,  5.60s/it]


Progress saved to papers_analysis_itercheckpoint_620_20250107_145629.json


Analyzing papers:  13%|██▌                 | 639/4886 [53:42<5:55:24,  5.02s/it]


Progress saved to papers_analysis_itercheckpoint_640_20250107_145803.json


Analyzing papers:  13%|██▋                 | 659/4886 [55:16<5:17:01,  4.50s/it]


Progress saved to papers_analysis_itercheckpoint_660_20250107_145938.json


Analyzing papers:  14%|██▊                 | 679/4886 [56:55<4:50:18,  4.14s/it]


Progress saved to papers_analysis_itercheckpoint_680_20250107_150117.json


Analyzing papers:  14%|██▊                 | 693/4886 [57:55<5:09:01,  4.42s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  14%|██▊                 | 699/4886 [58:30<7:42:10,  6.62s/it]


Progress saved to papers_analysis_itercheckpoint_700_20250107_150302.json


Analyzing papers:  15%|██▋               | 719/4886 [1:00:22<6:23:34,  5.52s/it]


Progress saved to papers_analysis_itercheckpoint_720_20250107_150447.json



Analyzing papers:  15%|██▋               | 720/4886 [1:00:29<7:09:10,  6.18s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  15%|██▋               | 739/4886 [1:02:06<5:52:56,  5.11s/it]


Progress saved to papers_analysis_itercheckpoint_740_20250107_150627.json


Analyzing papers:  16%|██▊               | 759/4886 [1:03:45<5:26:31,  4.75s/it]


Progress saved to papers_analysis_itercheckpoint_760_20250107_150808.json


Analyzing papers:  16%|██▊               | 779/4886 [1:05:32<5:46:35,  5.06s/it]


Progress saved to papers_analysis_itercheckpoint_780_20250107_150953.json


Analyzing papers:  16%|██▉               | 799/4886 [1:07:19<5:50:48,  5.15s/it]


Progress saved to papers_analysis_itercheckpoint_800_20250107_151141.json


Analyzing papers:  17%|███               | 819/4886 [1:09:16<5:39:12,  5.00s/it]


Progress saved to papers_analysis_itercheckpoint_820_20250107_151338.json


Analyzing papers:  17%|███               | 827/4886 [1:09:53<5:39:50,  5.02s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endofprompt|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endofprompt|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endofprompt|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endofprompt|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endofprompt|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endofprompt|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresp

Analyzing papers:  17%|███               | 839/4886 [1:11:20<7:19:12,  6.51s/it]


Progress saved to papers_analysis_itercheckpoint_840_20250107_151541.json


Analyzing papers:  18%|███▏              | 859/4886 [1:13:02<4:43:06,  4.22s/it]


Progress saved to papers_analysis_itercheckpoint_860_20250107_151725.json


Analyzing papers:  18%|███▏              | 879/4886 [1:14:57<5:25:30,  4.87s/it]


Progress saved to papers_analysis_itercheckpoint_880_20250107_151920.json


Analyzing papers:  18%|███▎              | 899/4886 [1:16:40<6:02:19,  5.45s/it]


Progress saved to papers_analysis_itercheckpoint_900_20250107_152103.json


Analyzing papers:  19%|███▍              | 919/4886 [1:18:20<4:18:20,  3.91s/it]


Progress saved to papers_analysis_itercheckpoint_920_20250107_152242.json


Analyzing papers:  19%|███▍              | 939/4886 [1:19:46<4:34:12,  4.17s/it]


Progress saved to papers_analysis_itercheckpoint_940_20250107_152408.json


Analyzing papers:  20%|███▌              | 959/4886 [1:21:17<5:31:24,  5.06s/it]


Progress saved to papers_analysis_itercheckpoint_960_20250107_152539.json


Analyzing papers:  20%|███▌              | 979/4886 [1:22:50<4:20:58,  4.01s/it]


Progress saved to papers_analysis_itercheckpoint_980_20250107_152711.json


Analyzing papers:  20%|███▋              | 999/4886 [1:24:22<4:31:51,  4.20s/it]


Progress saved to papers_analysis_itercheckpoint_1000_20250107_152844.json


Analyzing papers:  21%|███▌             | 1019/4886 [1:26:20<8:15:56,  7.69s/it]


Progress saved to papers_analysis_itercheckpoint_1020_20250107_153043.json


Analyzing papers:  21%|███▌             | 1039/4886 [1:28:01<5:29:46,  5.14s/it]


Progress saved to papers_analysis_itercheckpoint_1040_20250107_153227.json


Analyzing papers:  22%|███▋             | 1059/4886 [1:29:36<4:49:08,  4.53s/it]


Progress saved to papers_analysis_itercheckpoint_1060_20250107_153357.json


Analyzing papers:  22%|███▊             | 1079/4886 [1:31:20<6:45:03,  6.38s/it]


Progress saved to papers_analysis_itercheckpoint_1080_20250107_153542.json


Analyzing papers:  22%|███▊             | 1099/4886 [1:33:10<5:07:55,  4.88s/it]


Progress saved to papers_analysis_itercheckpoint_1100_20250107_153733.json


Analyzing papers:  23%|███▉             | 1119/4886 [1:34:49<5:37:13,  5.37s/it]


Progress saved to papers_analysis_itercheckpoint_1120_20250107_153916.json


Analyzing papers:  23%|███▉             | 1139/4886 [1:36:25<4:17:48,  4.13s/it]


Progress saved to papers_analysis_itercheckpoint_1140_20250107_154047.json


Analyzing papers:  24%|████             | 1159/4886 [1:37:53<4:57:22,  4.79s/it]


Progress saved to papers_analysis_itercheckpoint_1160_20250107_154215.json


Analyzing papers:  24%|████             | 1179/4886 [1:39:28<4:45:59,  4.63s/it]


Progress saved to papers_analysis_itercheckpoint_1180_20250107_154350.json


Analyzing papers:  25%|████▏            | 1199/4886 [1:40:56<4:20:20,  4.24s/it]


Progress saved to papers_analysis_itercheckpoint_1200_20250107_154518.json


Analyzing papers:  25%|████▏            | 1219/4886 [1:42:28<4:39:55,  4.58s/it]


Progress saved to papers_analysis_itercheckpoint_1220_20250107_154652.json


Analyzing papers:  25%|████▎            | 1239/4886 [1:44:21<6:02:16,  5.96s/it]


Progress saved to papers_analysis_itercheckpoint_1240_20250107_154843.json


Analyzing papers:  26%|████▍            | 1259/4886 [1:45:48<4:15:06,  4.22s/it]


Progress saved to papers_analysis_itercheckpoint_1260_20250107_155009.json


Analyzing papers:  26%|████▍            | 1279/4886 [1:47:12<4:30:26,  4.50s/it]


Progress saved to papers_analysis_itercheckpoint_1280_20250107_155140.json


Analyzing papers:  27%|████▌            | 1299/4886 [1:48:49<4:13:25,  4.24s/it]


Progress saved to papers_analysis_itercheckpoint_1300_20250107_155311.json


Analyzing papers:  27%|████▌            | 1319/4886 [1:50:30<5:37:57,  5.68s/it]


Progress saved to papers_analysis_itercheckpoint_1320_20250107_155451.json


Analyzing papers:  27%|████▋            | 1335/4886 [1:51:40<4:40:03,  4.73s/it]


Attempt 1/3 - Error: 'NoneType' object has no attribute 'strip'


Analyzing papers:  27%|████▋            | 1339/4886 [1:51:59<4:22:09,  4.43s/it]


Progress saved to papers_analysis_itercheckpoint_1340_20250107_155621.json


Analyzing papers:  28%|████▋            | 1359/4886 [1:53:24<4:22:50,  4.47s/it]


Progress saved to papers_analysis_itercheckpoint_1360_20250107_155746.json


Analyzing papers:  28%|████▊            | 1379/4886 [1:54:57<5:22:31,  5.52s/it]


Progress saved to papers_analysis_itercheckpoint_1380_20250107_155919.json


Analyzing papers:  29%|████▊            | 1399/4886 [1:56:30<5:00:15,  5.17s/it]


Progress saved to papers_analysis_itercheckpoint_1400_20250107_160052.json


Analyzing papers:  29%|████▉            | 1419/4886 [1:58:05<4:39:47,  4.84s/it]


Progress saved to papers_analysis_itercheckpoint_1420_20250107_160226.json


Analyzing papers:  29%|█████            | 1439/4886 [1:59:43<3:53:48,  4.07s/it]


Progress saved to papers_analysis_itercheckpoint_1440_20250107_160405.json


Analyzing papers:  30%|█████            | 1459/4886 [2:01:06<3:36:42,  3.79s/it]


Progress saved to papers_analysis_itercheckpoint_1460_20250107_160530.json


Analyzing papers:  30%|█████▏           | 1479/4886 [2:02:28<3:56:47,  4.17s/it]


Progress saved to papers_analysis_itercheckpoint_1480_20250107_160652.json


Analyzing papers:  31%|█████▏           | 1499/4886 [2:03:57<4:06:37,  4.37s/it]


Progress saved to papers_analysis_itercheckpoint_1500_20250107_160819.json


Analyzing papers:  31%|█████▎           | 1519/4886 [2:05:20<3:44:39,  4.00s/it]


Progress saved to papers_analysis_itercheckpoint_1520_20250107_160943.json


Analyzing papers:  31%|█████▎           | 1539/4886 [2:06:48<3:56:02,  4.23s/it]


Progress saved to papers_analysis_itercheckpoint_1540_20250107_161113.json


Analyzing papers:  32%|█████▍           | 1559/4886 [2:08:23<3:50:13,  4.15s/it]


Progress saved to papers_analysis_itercheckpoint_1560_20250107_161244.json


Analyzing papers:  32%|█████▍           | 1579/4886 [2:09:47<3:31:48,  3.84s/it]


Progress saved to papers_analysis_itercheckpoint_1580_20250107_161409.json


Analyzing papers:  33%|█████▌           | 1599/4886 [2:11:13<3:34:24,  3.91s/it]


Progress saved to papers_analysis_itercheckpoint_1600_20250107_161535.json


Analyzing papers:  33%|█████▋           | 1619/4886 [2:12:58<4:57:25,  5.46s/it]


Progress saved to papers_analysis_itercheckpoint_1620_20250107_161719.json


Analyzing papers:  34%|█████▋           | 1639/4886 [2:14:33<4:57:05,  5.49s/it]


Progress saved to papers_analysis_itercheckpoint_1640_20250107_161855.json


Analyzing papers:  34%|█████▊           | 1659/4886 [2:16:14<6:47:34,  7.58s/it]


Progress saved to papers_analysis_itercheckpoint_1660_20250107_162036.json


Analyzing papers:  34%|█████▊           | 1679/4886 [2:17:44<3:58:26,  4.46s/it]


Progress saved to papers_analysis_itercheckpoint_1680_20250107_162206.json


Analyzing papers:  35%|█████▉           | 1699/4886 [2:19:11<3:56:04,  4.44s/it]


Progress saved to papers_analysis_itercheckpoint_1700_20250107_162332.json


Analyzing papers:  35%|█████▉           | 1719/4886 [2:20:36<3:24:39,  3.88s/it]


Progress saved to papers_analysis_itercheckpoint_1720_20250107_162459.json


Analyzing papers:  36%|██████           | 1739/4886 [2:22:05<3:49:57,  4.38s/it]


Progress saved to papers_analysis_itercheckpoint_1740_20250107_162627.json


Analyzing papers:  36%|██████           | 1759/4886 [2:23:25<3:43:32,  4.29s/it]


Progress saved to papers_analysis_itercheckpoint_1760_20250107_162746.json


Analyzing papers:  36%|██████▏          | 1779/4886 [2:24:53<3:49:18,  4.43s/it]


Progress saved to papers_analysis_itercheckpoint_1780_20250107_162914.json


Analyzing papers:  37%|██████▎          | 1799/4886 [2:26:14<3:21:07,  3.91s/it]


Progress saved to papers_analysis_itercheckpoint_1800_20250107_163036.json


Analyzing papers:  37%|██████▎          | 1819/4886 [2:27:45<3:09:53,  3.71s/it]


Progress saved to papers_analysis_itercheckpoint_1820_20250107_163208.json


Analyzing papers:  38%|██████▍          | 1839/4886 [2:29:16<3:46:47,  4.47s/it]


Progress saved to papers_analysis_itercheckpoint_1840_20250107_163339.json


Analyzing papers:  38%|██████▍          | 1859/4886 [2:30:36<3:09:24,  3.75s/it]


Progress saved to papers_analysis_itercheckpoint_1860_20250107_163457.json


Analyzing papers:  38%|██████▌          | 1879/4886 [2:32:05<4:10:41,  5.00s/it]


Progress saved to papers_analysis_itercheckpoint_1880_20250107_163626.json


Analyzing papers:  39%|██████▌          | 1899/4886 [2:33:38<4:35:16,  5.53s/it]


Progress saved to papers_analysis_itercheckpoint_1900_20250107_163759.json


Analyzing papers:  39%|██████▋          | 1919/4886 [2:35:10<3:20:43,  4.06s/it]


Progress saved to papers_analysis_itercheckpoint_1920_20250107_163933.json


Analyzing papers:  40%|██████▋          | 1939/4886 [2:36:40<3:35:37,  4.39s/it]


Progress saved to papers_analysis_itercheckpoint_1940_20250107_164106.json


Analyzing papers:  40%|██████▊          | 1959/4886 [2:38:20<4:19:11,  5.31s/it]


Progress saved to papers_analysis_itercheckpoint_1960_20250107_164248.json


Analyzing papers:  41%|██████▉          | 1979/4886 [2:39:54<4:08:42,  5.13s/it]


Progress saved to papers_analysis_itercheckpoint_1980_20250107_164416.json


Analyzing papers:  41%|██████▉          | 1999/4886 [2:41:20<3:06:10,  3.87s/it]


Progress saved to papers_analysis_itercheckpoint_2000_20250107_164542.json


Analyzing papers:  41%|███████          | 2019/4886 [2:42:47<4:00:53,  5.04s/it]


Progress saved to papers_analysis_itercheckpoint_2020_20250107_164708.json


Analyzing papers:  42%|███████          | 2039/4886 [2:44:18<3:54:11,  4.94s/it]


Progress saved to papers_analysis_itercheckpoint_2040_20250107_164839.json


Analyzing papers:  42%|███████▏         | 2059/4886 [2:45:54<3:33:36,  4.53s/it]


Progress saved to papers_analysis_itercheckpoint_2060_20250107_165015.json


Analyzing papers:  43%|███████▏         | 2079/4886 [2:47:31<3:18:12,  4.24s/it]


Progress saved to papers_analysis_itercheckpoint_2080_20250107_165153.json


Analyzing papers:  43%|███████▎         | 2099/4886 [2:49:05<3:24:42,  4.41s/it]


Progress saved to papers_analysis_itercheckpoint_2100_20250107_165326.json


Analyzing papers:  43%|███████▎         | 2119/4886 [2:50:29<3:00:08,  3.91s/it]


Progress saved to papers_analysis_itercheckpoint_2120_20250107_165453.json


Analyzing papers:  44%|███████▍         | 2139/4886 [2:54:12<3:48:32,  4.99s/it]


Progress saved to papers_analysis_itercheckpoint_2140_20250107_165834.json


Analyzing papers:  44%|███████▌         | 2159/4886 [2:55:36<3:13:52,  4.27s/it]


Progress saved to papers_analysis_itercheckpoint_2160_20250107_165958.json


Analyzing papers:  45%|███████▌         | 2179/4886 [2:57:11<3:06:10,  4.13s/it]


Progress saved to papers_analysis_itercheckpoint_2180_20250107_170134.json


Analyzing papers:  45%|███████▋         | 2199/4886 [2:58:49<3:42:19,  4.96s/it]


Progress saved to papers_analysis_itercheckpoint_2200_20250107_170314.json


Analyzing papers:  45%|███████▋         | 2219/4886 [3:00:15<2:50:28,  3.84s/it]


Progress saved to papers_analysis_itercheckpoint_2220_20250107_170437.json


Analyzing papers:  46%|███████▊         | 2239/4886 [3:01:39<3:24:10,  4.63s/it]


Progress saved to papers_analysis_itercheckpoint_2240_20250107_170601.json


Analyzing papers:  46%|███████▊         | 2259/4886 [3:03:04<3:20:19,  4.58s/it]


Progress saved to papers_analysis_itercheckpoint_2260_20250107_170728.json


Analyzing papers:  47%|███████▉         | 2279/4886 [3:04:34<2:51:20,  3.94s/it]


Progress saved to papers_analysis_itercheckpoint_2280_20250107_170857.json


Analyzing papers:  47%|███████▉         | 2299/4886 [3:05:50<2:33:11,  3.55s/it]


Progress saved to papers_analysis_itercheckpoint_2300_20250107_171012.json


Analyzing papers:  47%|████████         | 2319/4886 [3:07:16<3:17:05,  4.61s/it]


Progress saved to papers_analysis_itercheckpoint_2320_20250107_171139.json


Analyzing papers:  48%|████████▏        | 2339/4886 [3:08:38<3:06:33,  4.39s/it]


Progress saved to papers_analysis_itercheckpoint_2340_20250107_171259.json


Analyzing papers:  48%|████████▏        | 2356/4886 [3:09:49<2:41:29,  3.83s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  48%|████████▏        | 2359/4886 [3:10:04<3:07:17,  4.45s/it]


Progress saved to papers_analysis_itercheckpoint_2360_20250107_171426.json


Analyzing papers:  49%|████████▎        | 2379/4886 [3:11:32<2:55:24,  4.20s/it]


Progress saved to papers_analysis_itercheckpoint_2380_20250107_171553.json


Analyzing papers:  49%|████████▎        | 2399/4886 [3:12:59<3:05:40,  4.48s/it]


Progress saved to papers_analysis_itercheckpoint_2400_20250107_171720.json


Analyzing papers:  50%|████████▍        | 2419/4886 [3:14:25<2:48:54,  4.11s/it]


Progress saved to papers_analysis_itercheckpoint_2420_20250107_171847.json


Analyzing papers:  50%|████████▍        | 2439/4886 [3:15:49<2:50:03,  4.17s/it]


Progress saved to papers_analysis_itercheckpoint_2440_20250107_172010.json


Analyzing papers:  50%|████████▌        | 2459/4886 [3:17:21<3:37:32,  5.38s/it]


Progress saved to papers_analysis_itercheckpoint_2460_20250107_172143.json


Analyzing papers:  51%|████████▋        | 2479/4886 [3:18:48<2:41:12,  4.02s/it]


Progress saved to papers_analysis_itercheckpoint_2480_20250107_172310.json


Analyzing papers:  51%|████████▋        | 2499/4886 [3:20:20<2:57:32,  4.46s/it]


Progress saved to papers_analysis_itercheckpoint_2500_20250107_172442.json


Analyzing papers:  52%|████████▊        | 2519/4886 [3:21:52<2:40:01,  4.06s/it]


Progress saved to papers_analysis_itercheckpoint_2520_20250107_172613.json


Analyzing papers:  52%|████████▊        | 2539/4886 [3:23:27<2:53:47,  4.44s/it]


Progress saved to papers_analysis_itercheckpoint_2540_20250107_172750.json


Analyzing papers:  52%|████████▉        | 2559/4886 [3:24:54<2:58:11,  4.59s/it]


Progress saved to papers_analysis_itercheckpoint_2560_20250107_172915.json


Analyzing papers:  53%|████████▉        | 2579/4886 [3:26:24<2:49:48,  4.42s/it]


Progress saved to papers_analysis_itercheckpoint_2580_20250107_173045.json


Analyzing papers:  53%|█████████        | 2599/4886 [3:27:46<2:25:44,  3.82s/it]


Progress saved to papers_analysis_itercheckpoint_2600_20250107_173209.json


Analyzing papers:  54%|█████████        | 2619/4886 [3:29:20<2:57:10,  4.69s/it]


Progress saved to papers_analysis_itercheckpoint_2620_20250107_173342.json


Analyzing papers:  54%|█████████▏       | 2639/4886 [3:30:41<2:35:35,  4.15s/it]


Progress saved to papers_analysis_itercheckpoint_2640_20250107_173504.json


Analyzing papers:  54%|█████████▎       | 2659/4886 [3:32:34<6:10:49,  9.99s/it]


Progress saved to papers_analysis_itercheckpoint_2660_20250107_173656.json


Analyzing papers:  55%|█████████▎       | 2679/4886 [3:34:07<2:48:27,  4.58s/it]


Progress saved to papers_analysis_itercheckpoint_2680_20250107_173830.json


Analyzing papers:  55%|█████████▍       | 2699/4886 [3:35:32<2:26:04,  4.01s/it]


Progress saved to papers_analysis_itercheckpoint_2700_20250107_173953.json


Analyzing papers:  56%|█████████▍       | 2719/4886 [3:37:07<4:58:53,  8.28s/it]


Progress saved to papers_analysis_itercheckpoint_2720_20250107_174134.json


Analyzing papers:  56%|█████████▌       | 2739/4886 [3:38:49<2:40:50,  4.49s/it]


Progress saved to papers_analysis_itercheckpoint_2740_20250107_174310.json


Analyzing papers:  56%|█████████▌       | 2759/4886 [3:40:35<3:37:31,  6.14s/it]


Progress saved to papers_analysis_itercheckpoint_2760_20250107_174458.json


Analyzing papers:  57%|█████████▋       | 2779/4886 [3:41:50<2:04:30,  3.55s/it]


Progress saved to papers_analysis_itercheckpoint_2780_20250107_174612.json


Analyzing papers:  57%|█████████▋       | 2799/4886 [3:43:03<2:10:40,  3.76s/it]


Progress saved to papers_analysis_itercheckpoint_2800_20250107_174725.json


Analyzing papers:  58%|█████████▊       | 2819/4886 [3:44:13<1:54:42,  3.33s/it]


Progress saved to papers_analysis_itercheckpoint_2820_20250107_174835.json


Analyzing papers:  58%|█████████▉       | 2839/4886 [3:45:30<2:03:01,  3.61s/it]


Progress saved to papers_analysis_itercheckpoint_2840_20250107_174951.json


Analyzing papers:  59%|█████████▉       | 2859/4886 [3:46:47<2:27:24,  4.36s/it]


Progress saved to papers_analysis_itercheckpoint_2860_20250107_175109.json


Analyzing papers:  59%|██████████       | 2879/4886 [3:47:59<2:10:19,  3.90s/it]


Progress saved to papers_analysis_itercheckpoint_2880_20250107_175220.json


Analyzing papers:  59%|██████████       | 2899/4886 [3:49:12<1:50:05,  3.32s/it]


Progress saved to papers_analysis_itercheckpoint_2900_20250107_175333.json


Analyzing papers:  60%|██████████▏      | 2919/4886 [3:50:27<2:15:00,  4.12s/it]


Progress saved to papers_analysis_itercheckpoint_2920_20250107_175448.json


Analyzing papers:  60%|██████████▏      | 2925/4886 [3:50:50<2:05:10,  3.83s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  60%|██████████▏      | 2939/4886 [3:51:42<1:49:05,  3.36s/it]


Progress saved to papers_analysis_itercheckpoint_2940_20250107_175603.json


Analyzing papers:  61%|██████████▎      | 2959/4886 [3:52:56<2:00:06,  3.74s/it]


Progress saved to papers_analysis_itercheckpoint_2960_20250107_175717.json


Analyzing papers:  61%|██████████▎      | 2979/4886 [3:54:10<1:54:31,  3.60s/it]


Progress saved to papers_analysis_itercheckpoint_2980_20250107_175832.json


Analyzing papers:  61%|██████████▍      | 2999/4886 [3:55:28<1:57:07,  3.72s/it]


Progress saved to papers_analysis_itercheckpoint_3000_20250107_175949.json


Analyzing papers:  62%|██████████▌      | 3019/4886 [3:56:44<2:02:48,  3.95s/it]


Progress saved to papers_analysis_itercheckpoint_3020_20250107_180105.json


Analyzing papers:  62%|██████████▌      | 3039/4886 [3:57:57<1:48:49,  3.54s/it]


Progress saved to papers_analysis_itercheckpoint_3040_20250107_180219.json


Analyzing papers:  62%|██████████▌      | 3048/4886 [3:58:35<1:59:07,  3.89s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  63%|██████████▋      | 3059/4886 [3:59:20<1:52:11,  3.68s/it]


Progress saved to papers_analysis_itercheckpoint_3060_20250107_180340.json


Analyzing papers:  63%|██████████▋      | 3079/4886 [4:00:32<1:50:50,  3.68s/it]


Progress saved to papers_analysis_itercheckpoint_3080_20250107_180453.json


Analyzing papers:  63%|██████████▊      | 3099/4886 [4:01:45<1:55:31,  3.88s/it]


Progress saved to papers_analysis_itercheckpoint_3100_20250107_180606.json


Analyzing papers:  64%|██████████▊      | 3119/4886 [4:02:54<1:44:55,  3.56s/it]


Progress saved to papers_analysis_itercheckpoint_3120_20250107_180715.json


Analyzing papers:  64%|██████████▉      | 3139/4886 [4:04:07<1:56:36,  4.01s/it]


Progress saved to papers_analysis_itercheckpoint_3140_20250107_180828.json


Analyzing papers:  65%|██████████▉      | 3159/4886 [4:05:25<1:46:54,  3.71s/it]


Progress saved to papers_analysis_itercheckpoint_3160_20250107_180947.json


Analyzing papers:  65%|███████████      | 3179/4886 [4:06:41<1:47:07,  3.77s/it]


Progress saved to papers_analysis_itercheckpoint_3180_20250107_181102.json


Analyzing papers:  65%|███████████▏     | 3199/4886 [4:08:03<1:46:50,  3.80s/it]


Progress saved to papers_analysis_itercheckpoint_3200_20250107_181224.json


Analyzing papers:  66%|███████████▏     | 3219/4886 [4:09:17<1:36:29,  3.47s/it]


Progress saved to papers_analysis_itercheckpoint_3220_20250107_181339.json


Analyzing papers:  66%|███████████▎     | 3239/4886 [4:10:35<1:51:16,  4.05s/it]


Progress saved to papers_analysis_itercheckpoint_3240_20250107_181456.json


Analyzing papers:  67%|███████████▎     | 3259/4886 [4:11:53<1:40:24,  3.70s/it]


Progress saved to papers_analysis_itercheckpoint_3260_20250107_181615.json


Analyzing papers:  67%|███████████▍     | 3279/4886 [4:13:02<1:30:00,  3.36s/it]


Progress saved to papers_analysis_itercheckpoint_3280_20250107_181723.json


Analyzing papers:  68%|███████████▍     | 3299/4886 [4:14:19<1:32:05,  3.48s/it]


Progress saved to papers_analysis_itercheckpoint_3300_20250107_181845.json


Analyzing papers:  68%|███████████▌     | 3319/4886 [4:15:46<1:36:52,  3.71s/it]


Progress saved to papers_analysis_itercheckpoint_3320_20250107_182008.json


Analyzing papers:  68%|███████████▌     | 3339/4886 [4:17:00<1:46:11,  4.12s/it]


Progress saved to papers_analysis_itercheckpoint_3340_20250107_182126.json


Analyzing papers:  69%|███████████▋     | 3359/4886 [4:18:16<1:31:02,  3.58s/it]


Progress saved to papers_analysis_itercheckpoint_3360_20250107_182239.json


Analyzing papers:  69%|███████████▊     | 3379/4886 [4:19:29<1:30:32,  3.60s/it]


Progress saved to papers_analysis_itercheckpoint_3380_20250107_182350.json


Analyzing papers:  70%|███████████▊     | 3399/4886 [4:20:46<2:01:54,  4.92s/it]


Progress saved to papers_analysis_itercheckpoint_3400_20250107_182508.json


Analyzing papers:  70%|███████████▉     | 3419/4886 [4:21:57<1:33:42,  3.83s/it]


Progress saved to papers_analysis_itercheckpoint_3420_20250107_182618.json


Analyzing papers:  70%|███████████▉     | 3436/4886 [4:22:57<1:27:45,  3.63s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  70%|███████████▉     | 3439/4886 [4:23:11<1:41:49,  4.22s/it]


Progress saved to papers_analysis_itercheckpoint_3440_20250107_182732.json


Analyzing papers:  71%|████████████     | 3459/4886 [4:24:36<1:33:36,  3.94s/it]


Progress saved to papers_analysis_itercheckpoint_3460_20250107_182857.json


Analyzing papers:  71%|████████████     | 3479/4886 [4:25:50<1:27:04,  3.71s/it]


Progress saved to papers_analysis_itercheckpoint_3480_20250107_183011.json


Analyzing papers:  72%|████████████▏    | 3499/4886 [4:27:02<1:25:11,  3.69s/it]


Progress saved to papers_analysis_itercheckpoint_3500_20250107_183123.json


Analyzing papers:  72%|████████████▏    | 3519/4886 [4:28:24<2:09:40,  5.69s/it]


Progress saved to papers_analysis_itercheckpoint_3520_20250107_183245.json


Analyzing papers:  72%|████████████▎    | 3539/4886 [4:29:41<1:22:18,  3.67s/it]


Progress saved to papers_analysis_itercheckpoint_3540_20250107_183403.json


Analyzing papers:  73%|████████████▍    | 3559/4886 [4:31:00<1:27:45,  3.97s/it]


Progress saved to papers_analysis_itercheckpoint_3560_20250107_183522.json


Analyzing papers:  73%|████████████▍    | 3579/4886 [4:32:09<1:16:07,  3.49s/it]


Progress saved to papers_analysis_itercheckpoint_3580_20250107_183631.json


Analyzing papers:  74%|████████████▌    | 3599/4886 [4:33:20<1:16:53,  3.58s/it]


Progress saved to papers_analysis_itercheckpoint_3600_20250107_183741.json


Analyzing papers:  74%|████████████▌    | 3619/4886 [4:34:33<1:17:42,  3.68s/it]


Progress saved to papers_analysis_itercheckpoint_3620_20250107_183854.json


Analyzing papers:  74%|████████████▋    | 3639/4886 [4:35:47<1:20:22,  3.87s/it]


Progress saved to papers_analysis_itercheckpoint_3640_20250107_184009.json


Analyzing papers:  75%|████████████▋    | 3659/4886 [4:37:05<1:12:36,  3.55s/it]


Progress saved to papers_analysis_itercheckpoint_3660_20250107_184126.json


Analyzing papers:  75%|████████████▋    | 3664/4886 [4:37:22<1:10:36,  3.47s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  75%|████████████▊    | 3679/4886 [4:38:22<1:20:42,  4.01s/it]


Progress saved to papers_analysis_itercheckpoint_3680_20250107_184243.json


Analyzing papers:  76%|████████████▊    | 3699/4886 [4:39:39<1:06:32,  3.36s/it]


Progress saved to papers_analysis_itercheckpoint_3700_20250107_184401.json


Analyzing papers:  76%|████████████▉    | 3719/4886 [4:40:56<1:18:18,  4.03s/it]


Progress saved to papers_analysis_itercheckpoint_3720_20250107_184518.json


Analyzing papers:  77%|█████████████    | 3739/4886 [4:42:10<1:12:26,  3.79s/it]


Progress saved to papers_analysis_itercheckpoint_3740_20250107_184634.json


Analyzing papers:  77%|█████████████    | 3759/4886 [4:43:28<1:03:52,  3.40s/it]


Progress saved to papers_analysis_itercheckpoint_3760_20250107_184750.json


Analyzing papers:  77%|█████████████▏   | 3779/4886 [4:44:44<1:09:49,  3.78s/it]


Progress saved to papers_analysis_itercheckpoint_3780_20250107_184905.json


Analyzing papers:  78%|█████████████▏   | 3799/4886 [4:46:07<1:42:56,  5.68s/it]


Progress saved to papers_analysis_itercheckpoint_3800_20250107_185029.json


Analyzing papers:  78%|█████████████▎   | 3819/4886 [4:47:25<1:19:24,  4.47s/it]


Progress saved to papers_analysis_itercheckpoint_3820_20250107_185146.json


Analyzing papers:  79%|█████████████▎   | 3839/4886 [4:48:47<1:16:36,  4.39s/it]


Progress saved to papers_analysis_itercheckpoint_3840_20250107_185309.json


Analyzing papers:  79%|█████████████▍   | 3859/4886 [4:50:02<1:04:06,  3.75s/it]


Progress saved to papers_analysis_itercheckpoint_3860_20250107_185424.json


Analyzing papers:  79%|█████████████▍   | 3879/4886 [4:51:18<1:06:20,  3.95s/it]


Progress saved to papers_analysis_itercheckpoint_3880_20250107_185539.json


Analyzing papers:  80%|█████████████▌   | 3899/4886 [4:52:33<1:01:02,  3.71s/it]


Progress saved to papers_analysis_itercheckpoint_3900_20250107_185654.json


Analyzing papers:  80%|█████████████▋   | 3919/4886 [4:53:52<1:02:49,  3.90s/it]


Progress saved to papers_analysis_itercheckpoint_3920_20250107_185818.json


Analyzing papers:  81%|███████████████▎   | 3939/4886 [4:55:11<57:23,  3.64s/it]


Progress saved to papers_analysis_itercheckpoint_3940_20250107_185933.json


Analyzing papers:  81%|███████████████▍   | 3959/4886 [4:56:28<57:34,  3.73s/it]


Progress saved to papers_analysis_itercheckpoint_3960_20250107_190049.json


Analyzing papers:  81%|█████████████▊   | 3979/4886 [4:57:47<1:09:44,  4.61s/it]


Progress saved to papers_analysis_itercheckpoint_3980_20250107_190210.json


Analyzing papers:  82%|█████████████▉   | 3999/4886 [4:59:24<1:05:01,  4.40s/it]


Progress saved to papers_analysis_itercheckpoint_4000_20250107_190346.json


Analyzing papers:  82%|███████████████▋   | 4019/4886 [5:00:46<58:02,  4.02s/it]


Progress saved to papers_analysis_itercheckpoint_4020_20250107_190508.json


Analyzing papers:  83%|███████████████▋   | 4039/4886 [5:02:15<58:03,  4.11s/it]


Progress saved to papers_analysis_itercheckpoint_4040_20250107_190637.json


Analyzing papers:  83%|███████████████▊   | 4059/4886 [5:03:36<50:11,  3.64s/it]


Progress saved to papers_analysis_itercheckpoint_4060_20250107_190758.json


Analyzing papers:  83%|███████████████▊   | 4079/4886 [5:04:48<50:03,  3.72s/it]


Progress saved to papers_analysis_itercheckpoint_4080_20250107_190909.json


Analyzing papers:  84%|██████████████▎  | 4099/4886 [5:06:13<1:06:44,  5.09s/it]


Progress saved to papers_analysis_itercheckpoint_4100_20250107_191034.json


Analyzing papers:  84%|████████████████   | 4119/4886 [5:07:36<58:50,  4.60s/it]


Progress saved to papers_analysis_itercheckpoint_4120_20250107_191158.json


Analyzing papers:  85%|████████████████   | 4139/4886 [5:08:57<46:25,  3.73s/it]


Progress saved to papers_analysis_itercheckpoint_4140_20250107_191318.json


Analyzing papers:  85%|████████████████▏  | 4159/4886 [5:10:10<44:14,  3.65s/it]


Progress saved to papers_analysis_itercheckpoint_4160_20250107_191432.json


Analyzing papers:  86%|████████████████▎  | 4179/4886 [5:11:27<45:19,  3.85s/it]


Progress saved to papers_analysis_itercheckpoint_4180_20250107_191548.json


Analyzing papers:  86%|████████████████▎  | 4199/4886 [5:12:48<47:42,  4.17s/it]


Progress saved to papers_analysis_itercheckpoint_4200_20250107_191709.json


Analyzing papers:  86%|████████████████▍  | 4219/4886 [5:14:04<44:28,  4.00s/it]


Progress saved to papers_analysis_itercheckpoint_4220_20250107_191825.json


Analyzing papers:  87%|████████████████▍  | 4227/4886 [5:14:33<40:26,  3.68s/it]


Attempt 1/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 2/3 - Error: Encountered text corresponding to disallowed special token '<|endoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|endoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|endoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


Attempt 3/3 - Error: Encountered text corresponding to di

Analyzing papers:  87%|████████████████▍  | 4239/4886 [5:15:23<45:24,  4.21s/it]


Progress saved to papers_analysis_itercheckpoint_4240_20250107_191945.json


Analyzing papers:  87%|████████████████▌  | 4259/4886 [5:16:45<45:16,  4.33s/it]


Progress saved to papers_analysis_itercheckpoint_4260_20250107_192109.json


Analyzing papers:  88%|████████████████▋  | 4279/4886 [5:18:02<42:11,  4.17s/it]


Progress saved to papers_analysis_itercheckpoint_4280_20250107_192223.json


Analyzing papers:  88%|████████████████▋  | 4299/4886 [5:19:36<41:05,  4.20s/it]


Progress saved to papers_analysis_itercheckpoint_4300_20250107_192357.json


Analyzing papers:  88%|████████████████▊  | 4319/4886 [5:20:55<35:08,  3.72s/it]


Progress saved to papers_analysis_itercheckpoint_4320_20250107_192517.json


Analyzing papers:  89%|████████████████▊  | 4339/4886 [5:22:36<47:27,  5.21s/it]


Progress saved to papers_analysis_itercheckpoint_4340_20250107_192657.json


Analyzing papers:  89%|████████████████▉  | 4359/4886 [5:24:06<38:06,  4.34s/it]


Progress saved to papers_analysis_itercheckpoint_4360_20250107_192828.json


Analyzing papers:  90%|█████████████████  | 4379/4886 [5:25:46<33:29,  3.96s/it]


Progress saved to papers_analysis_itercheckpoint_4380_20250107_193007.json


Analyzing papers:  90%|█████████████████  | 4399/4886 [5:27:07<34:10,  4.21s/it]


Progress saved to papers_analysis_itercheckpoint_4400_20250107_193128.json


Analyzing papers:  90%|█████████████████▏ | 4419/4886 [5:28:36<37:23,  4.80s/it]


Progress saved to papers_analysis_itercheckpoint_4420_20250107_193258.json


Analyzing papers:  91%|█████████████████▎ | 4439/4886 [5:30:09<28:38,  3.85s/it]


Progress saved to papers_analysis_itercheckpoint_4440_20250107_193431.json


Analyzing papers:  91%|█████████████████▎ | 4459/4886 [5:31:27<26:57,  3.79s/it]


Progress saved to papers_analysis_itercheckpoint_4460_20250107_193549.json


Analyzing papers:  92%|█████████████████▍ | 4479/4886 [5:33:04<32:49,  4.84s/it]


Progress saved to papers_analysis_itercheckpoint_4480_20250107_193726.json


Analyzing papers:  92%|█████████████████▍ | 4499/4886 [5:34:37<26:41,  4.14s/it]


Progress saved to papers_analysis_itercheckpoint_4500_20250107_193859.json


Analyzing papers:  92%|█████████████████▌ | 4519/4886 [5:36:00<27:03,  4.42s/it]


Progress saved to papers_analysis_itercheckpoint_4520_20250107_194022.json


Analyzing papers:  93%|█████████████████▋ | 4539/4886 [5:37:25<23:55,  4.14s/it]


Progress saved to papers_analysis_itercheckpoint_4540_20250107_194146.json


Analyzing papers:  93%|█████████████████▋ | 4559/4886 [5:38:44<19:29,  3.58s/it]


Progress saved to papers_analysis_itercheckpoint_4560_20250107_194308.json


Analyzing papers:  94%|█████████████████▊ | 4579/4886 [5:40:08<19:33,  3.82s/it]


Progress saved to papers_analysis_itercheckpoint_4580_20250107_194429.json


Analyzing papers:  94%|█████████████████▉ | 4599/4886 [5:41:53<36:55,  7.72s/it]


Progress saved to papers_analysis_itercheckpoint_4600_20250107_194615.json


Analyzing papers:  95%|█████████████████▉ | 4619/4886 [5:43:26<18:27,  4.15s/it]


Progress saved to papers_analysis_itercheckpoint_4620_20250107_194748.json


Analyzing papers:  95%|██████████████████ | 4639/4886 [5:45:06<22:13,  5.40s/it]


Progress saved to papers_analysis_itercheckpoint_4640_20250107_194928.json


Analyzing papers:  95%|██████████████████ | 4659/4886 [5:46:37<17:52,  4.73s/it]


Progress saved to papers_analysis_itercheckpoint_4660_20250107_195059.json


Analyzing papers:  96%|██████████████████▏| 4679/4886 [5:48:01<13:59,  4.05s/it]


Progress saved to papers_analysis_itercheckpoint_4680_20250107_195224.json


Analyzing papers:  96%|██████████████████▎| 4699/4886 [5:49:28<12:54,  4.14s/it]


Progress saved to papers_analysis_itercheckpoint_4700_20250107_195350.json


Analyzing papers:  97%|██████████████████▎| 4719/4886 [5:50:48<11:16,  4.05s/it]


Progress saved to papers_analysis_itercheckpoint_4720_20250107_195510.json


Analyzing papers:  97%|██████████████████▍| 4739/4886 [5:52:10<09:36,  3.92s/it]


Progress saved to papers_analysis_itercheckpoint_4740_20250107_195632.json


Analyzing papers:  97%|██████████████████▌| 4759/4886 [5:53:41<10:45,  5.08s/it]


Progress saved to papers_analysis_itercheckpoint_4760_20250107_195802.json


Analyzing papers:  98%|██████████████████▌| 4779/4886 [5:55:09<09:08,  5.12s/it]


Progress saved to papers_analysis_itercheckpoint_4780_20250107_195930.json


Analyzing papers:  98%|██████████████████▋| 4799/4886 [5:56:33<05:29,  3.79s/it]


Progress saved to papers_analysis_itercheckpoint_4800_20250107_200055.json


Analyzing papers:  99%|██████████████████▋| 4819/4886 [5:58:20<07:04,  6.34s/it]


Progress saved to papers_analysis_itercheckpoint_4820_20250107_200242.json


Analyzing papers:  99%|██████████████████▊| 4839/4886 [5:59:56<03:38,  4.64s/it]


Progress saved to papers_analysis_itercheckpoint_4840_20250107_200417.json


Analyzing papers:  99%|██████████████████▉| 4859/4886 [6:01:42<02:15,  5.04s/it]


Progress saved to papers_analysis_itercheckpoint_4860_20250107_200604.json


Analyzing papers: 100%|██████████████████▉| 4879/4886 [6:03:11<00:30,  4.31s/it]


Progress saved to papers_analysis_itercheckpoint_4880_20250107_200733.json


Analyzing papers: 100%|███████████████████| 4886/4886 [6:03:41<00:00,  4.47s/it]



Progress saved to papers_analysis_iterfinal_20250107_200800.json


In [4]:
import json
from collections import Counter
from typing import Dict, Any
import pandas as pd
import re

def normalize_model_name(name: str) -> str:
    """
    Normalizes model names to handle different variations.
    
    Args:
        name: Original model name
        
    Returns:
        Normalized model name
    """
    # Remove quotation marks and extra spaces
    name = name.strip('"').strip()
    
    # Convert to lowercase for initial processing
    normalized = name.lower()
    
    # Remove extra spaces between parts
    normalized = re.sub(r'\s+', ' ', normalized)
    
    # Standardize common variations while preserving distinct models
    replacements = {
        'gpt-3.5-turbo': ['gpt-3.5 turbo', 'gpt3.5-turbo', 'gpt3.5 turbo', 'gpt-3.5-t', 'chatgpt', 'chat-gpt', 'chat gpt'],  # ChatGPT uses GPT-3.5-turbo
        'gpt-3.5': ['gpt3.5', 'gpt 3.5'],  # Keep base GPT-3.5 separate
        'gpt-4': ['gpt4', 'gpt 4', 'gpt-4-turbo'],  # Group GPT-4 variations
        'llama-2': ['llama2', 'llama-2', 'llama 2'],  # Standardize LLaMA 2 naming
        'llama': ['llama1', 'llama 1'],  # Keep original LLaMA separate
        'roberta': ['roberta-base', 'roberta base'],
        'bert': ['bert-base', 'bert base'],
    }
    
    # Apply replacements
    for standard, variants in replacements.items():
        if normalized in variants or normalized == standard:
            return standard
            
    # Standardize separators
    normalized = re.sub(r'[-_\s]+', '-', normalized)
    
    return normalized

def analyze_paper_data(data: Dict[str, Any]) -> tuple[Counter, Counter]:
    """
    Recursively analyzes paper data to count unique models and benchmarks.
    """
    model_counter = Counter()
    benchmark_counter = Counter()
    
    def process_item(item):
        if isinstance(item, dict):
            # Handle base_models section
            if 'base_models' in item and isinstance(item['base_models'], list):
                for model in item['base_models']:
                    if isinstance(model, str):
                        normalized_name = normalize_model_name(model)
                        model_counter[normalized_name] += 1
            
            # Handle benchmarks section
            if 'benchmarks' in item and isinstance(item['benchmarks'], list):
                for benchmark in item['benchmarks']:
                    if isinstance(benchmark, str):
                        normalized_name = benchmark.strip('"').strip().lower()
                        normalized_name = re.sub(r'[-_\s]+', '-', normalized_name)
                        benchmark_counter[normalized_name] += 1
            
            # Recursively process all dictionary values
            for value in item.values():
                process_item(value)
                
        elif isinstance(item, list):
            # Recursively process all list items
            for value in item:
                process_item(value)
    
    process_item(data)
    return model_counter, benchmark_counter

def create_frequency_dataframes(model_counter: Counter, benchmark_counter: Counter) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates pandas DataFrames with frequency information.
    """
    # Create DataFrame for models
    models_df = pd.DataFrame([
        {"model": model, "frequency": count}
        for model, count in model_counter.most_common()
    ])
    
    # Create DataFrame for benchmarks
    benchmarks_df = pd.DataFrame([
        {"benchmark": benchmark, "frequency": count}
        for benchmark, count in benchmark_counter.most_common()
    ])
    
    return models_df, benchmarks_df

def print_statistics(models_df: pd.DataFrame, benchmarks_df: pd.DataFrame):
    """
    Prints detailed statistics about models and benchmarks.
    """
    print("=== Model Statistics ===")
    print(f"Total unique models (after normalization): {len(models_df)}")
    print(f"Total model mentions: {models_df['frequency'].sum()}")
    print("\nTop 20 most frequently mentioned models:")
    print(models_df.head(20).to_string(index=False))
    
    print("\n=== Benchmark Statistics ===")
    print(f"Total unique benchmarks: {len(benchmarks_df)}")
    print(f"Total benchmark mentions: {benchmarks_df['frequency'].sum()}")
    print("\nTop 20 most frequently mentioned benchmarks:")
    print(benchmarks_df.head(20).to_string(index=False))

def main():
    try:
        # Load and process data
        with open('papers_analysis_iterfinal_20250107_200800.json', 'r') as f:
            data = json.load(f)
        
        # Get frequency counts
        model_counter, benchmark_counter = analyze_paper_data(data)
        
        # Create DataFrames
        models_df, benchmarks_df = create_frequency_dataframes(model_counter, benchmark_counter)
        
        # Save frequency data to CSV
        models_df.to_csv('model_frequencies_normalized.csv', index=False)
        benchmarks_df.to_csv('benchmark_frequencies_normalized.csv', index=False)
        
        # Print statistics
        print_statistics(models_df, benchmarks_df)
        
        print("\nDetailed frequency data has been saved to:")
        print("- model_frequencies_normalized.csv")
        print("- benchmark_frequencies_normalized.csv")
        
    except FileNotFoundError:
        print("Error: papers_analysis_iterfinal_20250107_200800.json file not found")
    except json.JSONDecodeError:
        print("Error: Invalid JSON format in the input file")
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")

if __name__ == "__main__":
    main()

=== Model Statistics ===
Total unique models (after normalization): 4809
Total model mentions: 16665

Top 20 most frequently mentioned models:
        model  frequency
        gpt-4       1579
gpt-3.5-turbo        743
      gpt-3.5        530
       gpt-4o        251
        gpt-3        250
         bert        198
   mistral-7b        183
   llama-2-7b        165
     llama-7b        140
        llama        140
    llama2-7b        134
       gpt-4v        120
      llama-2        109
      roberta        108
  llama-2-13b        102
 gpt-3-(175b)         98
  llama-2-70b         95
        gpt-2         90
   vicuna-13b         86
           t5         78

=== Benchmark Statistics ===
Total unique benchmarks: 7635
Total benchmark mentions: 15013

Top 20 most frequently mentioned benchmarks:
    benchmark  frequency
        gsm8k        424
         math        175
         mmlu        171
        svamp        139
   strategyqa        126
     hotpotqa        120
    humaneval      

In [1]:
import json
import arxiv
import time
import re
from datetime import datetime
from tqdm import tqdm

def clean_title(title):
    """Clean and normalize the title for better matching"""
    # Remove special characters and extra whitespace
    cleaned = re.sub(r'[^\w\s-]', '', title)
    cleaned = ' '.join(cleaned.split())
    return cleaned

def fetch_arxiv_date(title):
    """
    Fetch publication date from arXiv given a paper title.
    Try multiple search strategies.
    """
    client = arxiv.Client()
    
    # Strategy 1: Exact title match
    search = arxiv.Search(
        query=f'ti:"{title}"',
        max_results=1
    )
    result = next(client.results(search), None)
    if result:
        return result.published.strftime('%Y-%m-%d'), result.entry_id

    time.sleep(1)  # Rate limiting between attempts
    
    # Strategy 2: Search without quotes and ti: prefix
    cleaned_title = clean_title(title)
    search = arxiv.Search(
        query=cleaned_title,
        max_results=5  # Get more results to find potential matches
    )
    
    # Check each result for title similarity
    results = list(client.results(search))
    for result in results:
        result_title = clean_title(result.title)
        # Check if titles are very similar
        if (cleaned_title.lower() in result_title.lower() or 
            result_title.lower() in cleaned_title.lower()):
            return result.published.strftime('%Y-%m-%d'), result.entry_id
    
    time.sleep(1)  # Rate limiting between attempts
    
    # Strategy 3: Try with first few words of title
    first_words = ' '.join(cleaned_title.split()[:4])  # First 4 words
    search = arxiv.Search(
        query=f'ti:"{first_words}"',
        max_results=5
    )
    
    results = list(client.results(search))
    for result in results:
        result_title = clean_title(result.title)
        if cleaned_title.lower() in result_title.lower():
            return result.published.strftime('%Y-%m-%d'), result.entry_id
            
    return None, None

# Load your JSON data
with open('papers_analysis_iterfinal_20250107_200800.json', 'r') as f:
    papers_data = json.load(f)

# Add dates to the data
for title in tqdm(papers_data.keys(), desc="Fetching paper dates", unit="paper"):
    paper_info = papers_data[title]
    date, arxiv_id = fetch_arxiv_date(title)
    if date:
        paper_info['publication_date'] = date
        paper_info['arxiv_id'] = arxiv_id
        # tqdm.write(f"✓ Found date for: {title} ({date})")
    else:
        continue
        # tqdm.write(f"✗ Could not find date for: {title}")
    # Respect rate limits
    time.sleep(2)

# Save the updated data
with open('papers_data_with_dates.json', 'w') as f:
    json.dump(papers_data, f, indent=4)

print("\nDone! Checking results...")
# Print summary of papers without dates
missing_dates = [title for title, info in papers_data.items() 
                if 'publication_date' not in info]
if missing_dates:
    print(f"\nCould not find dates for {len(missing_dates)} papers:")
    for title in missing_dates:
        print(f"- {title}")
else:
    print("\nFound dates for all papers!")

Fetching paper dates: 100%|████████████| 4886/4886 [6:25:52<00:00,  4.74s/paper]


Done! Checking results...

Could not find dates for 330 papers:
- Energy-Based Diffusion Language Models for Text Generation
- Advancing GenAI Assisted Programming-A Comparative Study on Prompt Efficiency and Code Quality Between GPT-4 and GLM-4
- Meaningful Learning Advancing Abstract Reasoning in Large Language Models via Generic Fact Guidance
- Autonomous Tree-search Ability of Large Language Models
- Retrieval-Augmented Generation for AI-Generated Content A Survey
- Self-Consistency Preference Optimization
- Language Agents Meet Causality - Bridging LLMs and Causal World Models
- Ambiguity-Aware In-Context Learning with Large Language Models
- AVA Towards Autonomous Visualization Agents through Visual PerceptionDriven DecisionMaking
- Prompt Perturbation in Retrieval-Augmented Generation based Large Language Models
- Self-Supervised Multimodal Learning A Survey
- Non-myopic Generation of Language Models for Reasoning and Planning
- Towards More Effective Table-to-Text Generation A




In [15]:
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
import re
import numpy as np
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
from scipy import stats

def normalize_model_name(name: str) -> tuple:
    """
    Normalizes model names while preserving important distinctions between model sizes
    and architectural variants. Returns tuple of (normalized_name, original_name)
    """
    original = name.strip('"').strip()
    normalized = original.lower()
    normalized = re.sub(r'\s+', ' ', normalized)
    
    # Extract parameter count/size information
    size_pattern = r'[\(\[\s]?(\d+[bm])[\)\]\s]?'
    size_match = re.search(size_pattern, normalized, re.IGNORECASE)
    model_size = size_match.group(1).lower() if size_match else None
    
    # First check for GPT-4 variants
    gpt4_patterns = [
        'gpt4', 'gpt 4', 'gpt-4', 'gpt-4-turbo', 'gpt4-turbo',
        'chatgpt-4', 'chat-gpt-4', 'chatgpt-4.0', 'chat gpt 4', 
        'chatgpt 4', 'gpt-4 turbo', 'gpt 4 turbo'
    ]
    
    if any(re.match(f'^{re.escape(pattern)}', normalized) for pattern in gpt4_patterns):
        if any(v in normalized for v in ['vision', '-v', 'v']):
            return 'gpt-4v', original
        elif any(o in normalized for o in ['4o', '4-o', '4 o']):
            return 'gpt-4o', original
        return 'gpt-4', 'GPT-4'
    
    # Model family mappings with their variants
    base_models = {
        'gpt-3.5': [
            'gpt-3.5-turbo', 'gpt3.5-turbo', 'gpt3.5 turbo', 
            'gpt-3.5-t', 'chatgpt', 'chat-gpt', 'chat gpt', 
            'chatgpt-3.5', 'chatgpt-3.5-turbo', 'chat-gpt-3.5',
            'chat gpt 3.5', 'chatgpt 3.5', 'gpt3.5'
        ],
        'gpt-3': [
            'gpt-3', 'gpt3', 'gpt 3', 'gpt-3-davinci',
            'davinci', 'text-davinci', 'davinci-002',
            'chatgpt-3', 'chat-gpt-3', 'gpt-3-(175b)',
            'gpt-3 (175b)', 'gpt-3 (davinci)'
        ],
        'llama-2': [
            'llama2', 'llama 2', 'llama-2', 'llama2-', 
            'llama-2-', 'llama 2 ', 'llama2 ', 'llama-2 '
        ],
        'llama': [
            'llama1', 'llama 1', 'llama-1', 'llama-', 
            'llama ', 'llamab', 'llama-b'
        ],
        'mistral': [
            'mistral', 'mistral-', 'mistral '
        ],
        'palm': ['palm-', 'palm2', 'palm 2', 'palm-2'],
        'bert': ['bert-base', 'bert base', 'bert-large', 'bert large'],
        'roberta': ['roberta-base', 'roberta base', 'roberta-large', 'roberta large'],
        't5': ['t5-', 't5 ', 't5base', 't5-base', 't5 base']
    }
    
    # Check for other model matches
    for family, variants in base_models.items():
        patterns = [f'^{re.escape(v)}' for v in variants + [family]]
        if any(re.match(pattern, normalized) for pattern in patterns):
            if family in ['llama', 'llama-2', 'mistral']:
                base = family
                
                if any(v in normalized for v in ['instruct', 'ins']):
                    base = f"{base}-instruct"
                elif 'chat' in normalized:
                    base = f"{base}-chat"
                    
                if family == 'mistral' and model_size:
                    return f"{base}-{model_size}", original
                elif model_size:
                    return f"{base}-{model_size}", original
                    
                return base, original
            
            if model_size and family in ['bert', 'roberta', 't5']:
                return f"{family}-{model_size}", original
            
            return family, original
    
    # Remove parenthetical details and extra whitespace
    normalized = re.sub(r'\s*\([^)]*\)\s*', '', normalized)
    normalized = re.sub(r'\s*\[[^\]]*\]\s*', '', normalized)
    normalized = re.sub(r'[-_\s]+', '-', normalized)
    
    if model_size and not normalized.endswith(model_size):
        normalized = f"{normalized}-{model_size}"
        
    return normalized, original

def plot_ridgeline(ax, df, names, counter, colors, vertical_spacing, original_names=None):
    """Plot the ridgeline distribution"""
    # Set fixed date range
    date_min = datetime(2022, 1, 1)
    date_max = datetime(2024, 12, 31)
    
    months = pd.date_range(start=date_min, end=date_max, freq='M')
    month_width = (months[1] - months[0])
    
    monthly_totals = df.groupby(df['date'].dt.to_period('M'))['name'].value_counts()
    max_freq = 0
    
    for idx, name in enumerate(reversed(names)):
        if name in monthly_totals.unstack():
            monthly_counts = monthly_totals.unstack()[name].fillna(0)
            monthly_sums = monthly_totals.groupby('date').sum()
            monthly_freq = monthly_counts / monthly_sums
            max_freq = max(max_freq, monthly_freq.max())
            
            plot_dates = pd.to_datetime([period.to_timestamp() for period in monthly_freq.index])
            baseline = idx * vertical_spacing
            
            # Plot filled bars
            ax.bar(plot_dates, 
                  monthly_freq,
                  width=month_width,
                  bottom=baseline,
                  color=colors[min(idx, len(colors)-1)],
                  alpha=0.6,
                  align='edge',
                  zorder=idx)
            
            # Add border
            ax.bar(plot_dates,
                  monthly_freq,
                  width=month_width,
                  bottom=baseline,
                  color='none',
                  edgecolor=colors[min(idx, len(colors)-1)],
                  linewidth=1.0,
                  alpha=0.9,
                  align='edge',
                  zorder=idx)
            
            # Add peak value label
            max_month_idx = monthly_freq.argmax()
            max_month_date = monthly_freq.index[max_month_idx].to_timestamp()
            ax.text(max_month_date, 
                   baseline + monthly_freq[max_month_idx], 
                   f'{monthly_freq[max_month_idx]:.2f}',
                   ha='left', va='bottom', fontsize=8)
            
            # Add horizontal line
            ax.axhline(y=baseline,
                      color='gray',
                      linewidth=0.2,
                      alpha=0.3,
                      zorder=0)
            
            # Add labels on the right side
            display_name = original_names.get(name, name) if original_names else name
            ax.annotate(f"{display_name} ({counter[name]})",
                       xy=(date_max, baseline),
                       xytext=(5, 0),
                       textcoords="offset points",
                       va='center',
                       fontsize=9,
                       fontfamily='serif',
                       fontstyle='normal',
                       bbox=dict(facecolor='white',
                               edgecolor='none',
                               alpha=0.9,
                               pad=1))

def create_combined_plot(df, names, counter, title, filename, color_h, original_names=None):
    """Create a combined plot with paper counts and distribution"""
    plt.style.use('default')
    plt.rcParams.update({
        'font.family': 'serif',
        'font.size': 10,
        'axes.labelsize': 11,
        'axes.titlesize': 12,
        'figure.titlesize': 14
    })

    fig = plt.figure(figsize=(12, 10))
    
    # Create top subplot for paper counts (25% of height)
    ax1 = plt.subplot2grid((4, 1), (0, 0), rowspan=1)
    
    # Get total mentions per month
    monthly_mentions = df.groupby(df['date'].dt.to_period('M')).size()
    
    start_date = datetime(2022, 1, 1)
    end_date = datetime(2024, 12, 31)
    
    # First, filter papers_df to the date range we want
    papers_df_filtered = papers_df[
        (papers_df.index >= start_date) & 
        (papers_df.index <= end_date)
    ]
    
    # Create date range and align data
    monthly_dates = pd.date_range(start=start_date, end=end_date, freq='M')
    
    # Only include months where we actually have papers
    valid_months = papers_df_filtered.index.to_period('M')
    monthly_mentions = monthly_mentions[monthly_mentions.index.isin(valid_months)]
    
    # Resample papers data
    monthly_papers = papers_df_filtered.resample('M')['total_papers'].last()
    
    # Ensure both series align with actual paper months
    monthly_papers = monthly_papers[monthly_papers > 0]
    monthly_mentions = monthly_mentions[monthly_mentions.index.isin(monthly_papers.index.to_period('M'))]
    
    # Create date range for plotting
    start_date = datetime(2022, 1, 1)
    end_date = datetime(2024, 12, 31)
    monthly_dates = pd.date_range(start=start_date, end=end_date, freq='MS')  # MS = Month Start
    
    # Filter and align the data
    papers_df_filtered = papers_df[
        (papers_df.index >= start_date) & 
        (papers_df.index <= end_date)
    ]
    
    # Resample to month start frequency
    monthly_papers = papers_df_filtered.resample('MS')['total_papers'].last()
    monthly_mentions = df.groupby(df['date'].dt.to_period('M')).size()
    
    # Ensure both series align with actual paper months
    monthly_papers = monthly_papers[monthly_papers > 0]
    monthly_mentions = monthly_mentions[monthly_mentions.index.isin(monthly_papers.index.to_period('M'))]
    
    # Convert period index to timestamp for plotting
    plot_dates = pd.date_range(start=monthly_papers.index.min(), 
                             end=monthly_papers.index.max(), 
                             freq='MS')
    
    # Plot paper counts - align to start of each month
    bars = ax1.bar(plot_dates,
                   monthly_papers.values,
                   width=30,
                   color='gray',
                   alpha=0.5,
                   align='edge')  # Align to start of month

    # Customize top subplot
    ax1.set_xlim(start_date, end_date)
    ax1.spines['top'].set_visible(False)
    
    # Add value labels for paper counts
    for bar in bars:
        height = bar.get_height()
        if height > 0:
            ax1.text(bar.get_x() + bar.get_width()/2, height,
                    f'{int(height)}',
                    ha='center', va='bottom', fontsize=8)
    
    # Create bottom subplot for distribution (75% of height)
    ax2 = plt.subplot2grid((4, 1), (1, 0), rowspan=3)
    
    n_shades = len(names)
    vertical_spacing = 0.35
    colors = sns.husl_palette(n_shades, h=color_h, s=0.85, l=0.6)
    
    plot_ridgeline(ax2, df, names, counter, colors, vertical_spacing, original_names)
    
    ax2.set_xlim(start_date, end_date)
    ax2.yaxis.set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['left'].set_visible(False)
    
    # Format x-axis with year ticks
    for ax in [ax1, ax2]:
        # Set major ticks only at start of year
        ax.xaxis.set_major_locator(mdates.YearLocator(month=1, day=1))
        
        # Remove year labels from ticks
        ax.xaxis.set_major_formatter(plt.NullFormatter())
        
        # Handle year labels differently for each subplot
        if ax == ax1:  # Top subplot
            y_pos = ax.get_ylim()[0] - 0.1
        else:  # Bottom subplot (ax2)
            y_pos = -0.02  # Reduced negative value to move labels up
            
        # Manually add year labels at the middle of each year
        years = range(2022, 2025)
        for year in years:
            mid_year_date = datetime(year, 7, 1)
            ax.text(mdates.date2num(mid_year_date), y_pos,
                   str(year),
                   ha='center',
                   va='top',
                   transform=ax.get_xaxis_transform())
        
        # Add minor ticks for all months
        ax.xaxis.set_minor_locator(mdates.MonthLocator())
        
        # Customize tick lengths
        ax.tick_params(axis='x', which='major', length=8)  # Longer ticks for years
        ax.tick_params(axis='x', which='minor', length=4)  # Shorter ticks for months
        
        # Adjust label positions to center them
        ax.tick_params(axis='x', which='major', pad=10)  # Add padding for year labels
        
        # Add subtle grid
        ax.grid(True, alpha=0.1, linestyle='-', axis='x')
        ax.grid(True, alpha=0.05, linestyle='-', axis='x', which='minor')
    
    # Only show x-label on bottom plot
    ax1.set_xlabel('')
    ax2.set_xlabel('')
    
    # Adjust layout
    plt.subplots_adjust(right=0.85, hspace=0.3)
    
    plt.savefig(filename,
                dpi=300,
                bbox_inches='tight',
                facecolor='white')
    plt.close()

# Load and process data
with open('papers_data_with_dates.json', 'r') as f:
    papers_data = json.load(f)

# Calculate total papers per month
all_papers_by_month = {}
for title, paper_info in papers_data.items():
    if 'publication_date' in paper_info:
        date = paper_info['publication_date']
        date_obj = datetime.strptime(date, '%Y-%m-%d')
        month_key = date_obj.strftime('%Y-%m')
        all_papers_by_month[month_key] = all_papers_by_month.get(month_key, 0) + 1

# Convert to DataFrame for plotting
papers_df = pd.DataFrame(
    [(datetime.strptime(k, '%Y-%m'), v) for k, v in all_papers_by_month.items()],
    columns=['date', 'total_papers']
)
papers_df.set_index('date', inplace=True)

# Get frequency counts
model_counter = Counter()
benchmark_counter = Counter()
model_original_names = {}
benchmark_original_names = {}
benchmark_records = []
model_records = []

# Process the data
for title, paper_info in papers_data.items():
    if 'publication_date' in paper_info and 'analysis' in paper_info:
        date = paper_info['publication_date']
        date_obj = datetime.strptime(date, '%Y-%m-%d')
        
        # Only process data from 2022 onwards
        if date_obj.year >= 2022:
            # Process benchmarks
            if 'benchmarks' in paper_info['analysis']:
                for benchmark in paper_info['analysis']['benchmarks']:
                    normalized_benchmark = benchmark.strip('"').strip().lower()
                    normalized_benchmark = re.sub(r'[-_\s]+', '-', normalized_benchmark)
                    benchmark_counter[normalized_benchmark] += 1
                    benchmark_original_names[normalized_benchmark] = benchmark.strip('"').strip()
                    benchmark_records.append({
                        'date': date_obj,
                        'name': normalized_benchmark,
                        'type': 'Benchmark',
                        'paper': title
                    })
            
            # Process models
            if 'base_models' in paper_info['analysis']:
                for model in paper_info['analysis']['base_models']:
                    normalized_model, original_name = normalize_model_name(model)
                    model_counter[normalized_model] += 1
                    if normalized_model not in model_original_names:
                        model_original_names[normalized_model] = original_name
                    model_records.append({
                        'date': date_obj,
                        'name': normalized_model,
                        'type': 'Model',
                        'paper': title
                    })

# Get top 20 benchmarks and models
top_20_benchmarks = [b[0] for b in benchmark_counter.most_common(20)]
top_20_models = [m[0] for m in model_counter.most_common(20)]

# Create DataFrames
df_benchmarks = pd.DataFrame(benchmark_records)
df_models = pd.DataFrame(model_records)

# Filter for top 20
df_benchmarks = df_benchmarks[df_benchmarks['name'].isin(top_20_benchmarks)]
df_models = df_models[df_models['name'].isin(top_20_models)]

# Update DataFrames to use datetime
df_benchmarks['date'] = pd.to_datetime(df_benchmarks['date'])
df_models['date'] = pd.to_datetime(df_models['date'])

# Calculate statistics
total_papers = len(set(title for title, paper_info in papers_data.items() 
                      if 'publication_date' in paper_info 
                      and datetime.strptime(paper_info['publication_date'], '%Y-%m-%d').year >= 2022))

total_model_mentions = 0
total_benchmark_mentions = 0
papers_with_models = 0
papers_with_benchmarks = 0

for title, paper_info in papers_data.items():
    if 'publication_date' in paper_info and 'analysis' in paper_info:
        date = datetime.strptime(paper_info['publication_date'], '%Y-%m-%d')
        if date.year >= 2022:
            # Count model mentions
            if 'base_models' in paper_info['analysis'] and paper_info['analysis']['base_models']:
                papers_with_models += 1
                total_model_mentions += len(paper_info['analysis']['base_models'])
            
            # Count benchmark mentions
            if 'benchmarks' in paper_info['analysis'] and paper_info['analysis']['benchmarks']:
                papers_with_benchmarks += 1
                total_benchmark_mentions += len(paper_info['analysis']['benchmarks'])

# Calculate averages
avg_models_per_paper = total_model_mentions / total_papers
avg_benchmarks_per_paper = total_benchmark_mentions / total_papers
avg_models_when_present = total_model_mentions / papers_with_models if papers_with_models > 0 else 0
avg_benchmarks_when_present = total_benchmark_mentions / papers_with_benchmarks if papers_with_benchmarks > 0 else 0

print("\nPaper Statistics:")
print(f"Total papers analyzed: {total_papers}")
print(f"\nModel statistics:")
print(f"Total model mentions: {total_model_mentions}")
print(f"Papers that mention models: {papers_with_models} ({papers_with_models/total_papers*100:.1f}%)")
print(f"Average models per paper (across all papers): {avg_models_per_paper:.2f}")
print(f"Average models when present: {avg_models_when_present:.2f}")

print(f"\nBenchmark statistics:")
print(f"Total benchmark mentions: {total_benchmark_mentions}")
print(f"Papers that mention benchmarks: {papers_with_benchmarks} ({papers_with_benchmarks/total_papers*100:.1f}%)")
print(f"Average benchmarks per paper (across all papers): {avg_benchmarks_per_paper:.2f}")
print(f"Average benchmarks when present: {avg_benchmarks_when_present:.2f}")

# Create combined plots
create_combined_plot(df_benchmarks, top_20_benchmarks, benchmark_counter,
                    'Distribution of Benchmark Usage Over Time',
                    'benchmark_distribution.pdf', 0.5,
                    benchmark_original_names)

create_combined_plot(df_models, top_20_models, model_counter,
                    'Distribution of Model Usage Over Time',
                    'model_distribution.pdf', 0.9,
                    model_original_names)

# Print statistics
print("\nTop 20 Benchmarks and their frequencies:")
for benchmark, count in benchmark_counter.most_common(20):
    original_name = benchmark_original_names.get(benchmark, benchmark)
    print(f"{original_name}: {count}")

print("\nTop 20 Models and their frequencies:")
for model, count in model_counter.most_common(20):
    original_name = model_original_names.get(model, model)
    print(f"{original_name}: {count}")


Paper Statistics:
Total papers analyzed: 4865

Model statistics:
Total model mentions: 16629
Papers that mention models: 4760 (97.8%)
Average models per paper (across all papers): 3.42
Average models when present: 3.49

Benchmark statistics:
Total benchmark mentions: 14903
Papers that mention benchmarks: 4464 (91.8%)
Average benchmarks per paper (across all papers): 3.06
Average benchmarks when present: 3.34

Top 20 Benchmarks and their frequencies:
GSM8K: 425
MATH: 174
MMLU: 171
SVAMP: 139
StrategyQA: 126
HotpotQA: 120
HumanEval: 119
TruthfulQA: 81
CommonsenseQA: 77
HellaSwag: 70
AQUA: 64
TriviaQA: 63
MBPP: 63
MultiArith: 61
Winogrande: 56
BoolQ: 55
PiQA: 52
openbookqa: 49
ASDiv: 46
SQuAD: 44

Top 20 Models and their frequencies:
GPT-4: 1789
ChatGPT (gpt-3.5-turbo): 1739
GPT-3 (DaVinci-002): 705
PaLM-2: 415
GPT4o: 373
LLaMa 2 (7B parameters): 372
BERT: 266
Llama-3-8B: 264
Mistral 7B: 243
LLaMA-1: 230
LLaMA-7B: 201
LLAMA-2 (13b): 198
GPT-4V: 195
Llama3 (70B): 175
LLaMA 2 70B: 172
T5: 