In [65]:
import os
import json
import csv


# Paths used by the URL-matching steps below.
json_dir_path = "../scraped_paper_reviewer_assignment"  # Directory containing the JSON files
csv_file_path = "merged.csv"  # Path to the input CSV file
output_csv_file_path = "updated_merged.csv"  # Path to the output CSV file
log_file_path = "unmatched_papers.log"  # Log file listing titles that had no matching URL

In [66]:
# Step 1: Load all JSON data from the directory
# Builds `title_to_url`: paper title -> URL, taken from the "paper_url" field
# or, when that is missing/empty, from the "link" field. Entries without a
# URL are not recorded. If the same title occurs more than once, the last
# occurrence wins.
title_to_url = {}
# os.listdir() returns names in arbitrary order; sorting makes both the
# duplicate-title winner and the dict's insertion order reproducible.
for json_file_name in sorted(os.listdir(json_dir_path)):
    if not json_file_name.endswith(".json"):  # Only process JSON files
        continue
    json_file_path = os.path.join(json_dir_path, json_file_name)
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
        # Keep the try narrow: only a malformed file should be skipped here.
        try:
            papers_data = json.load(json_file)
        except json.JSONDecodeError:
            print(f"Error decoding {json_file_name}, skipping.")
            continue

    for paper in papers_data:
        # Check for either "paper_url" or "link" field
        url = paper.get("paper_url") or paper.get("link")
        if not url:
            continue
        title = paper.get("title")
        if not title:
            # Previously paper["title"] raised KeyError and aborted the whole load.
            print(f"Entry with URL {url} in {json_file_name} has no title, skipping.")
            continue
        title_to_url[title] = url


In [68]:
# Sanity check: report how many title -> URL pairs are in the mapping,
# then display the mapping itself as the cell output.
print("Total collected papers: ", len(title_to_url))
title_to_url


Total collected papers:  71


{'Croissant: A metadata format for ml-ready datasets': 'https://proceedings.neurips.cc/paper_files/paper/2024/hash/9547b09b722f2948ff3ddb5d86002bc0-Abstract-Datasets_and_Benchmarks_Track.html',
 'Tunetables: Context optimization for scalable prior-data fitted networks': 'https://proceedings.neurips.cc/paper_files/paper/2024/hash/97dc07f1253ab33ee514f395a82fa7cc-Abstract-Conference.html',
 'Genetic programming for feature selection based on feature removal impact in high-dimensional symbolic regression': 'https://ieeexplore.ieee.org/abstract/document/10466603/',
 'Dissecting sample hardness: A fine-grained analysis of hardness characterization methods for data-centric ai': 'https://arxiv.org/abs/2403.04551',
 'Deep learning through a telescoping lens: A simple model provides empirical insights on grokking, gradient boosting &amp; beyond': 'https://proceedings.neurips.cc/paper_files/paper/2024/hash/df334022279996b07e0870a629c18857-Abstract-Conference.html',
 'MLSea: a semantic layer for 

In [None]:
"""
Step 2:
Processes existing CSV file, updates each row by adding a "Paper URL" column, 
and tracks papers that do not have a matching URL. 
"""

updated_rows = []      # every data row of the input CSV, with its URL cell filled in
unmatched_papers = []  # titles for which title_to_url has no entry

# newline='' is what the csv module expects for files handed to csv.reader,
# so that newlines embedded in quoted fields are parsed correctly.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    header = next(csv_reader)  # Extract the header

    # Remember whether the column was already there: if so, URLs must be
    # written INTO that column instead of being appended as an extra cell.
    has_url_column = "Paper URL" in header
    if not has_url_column:
        header.append("Paper URL")  # Add a new column for URLs if not present
    url_column_index = header.index("Paper URL")

    for row in csv_reader:
        if not row:
            # csv.reader yields [] for a blank line; there is no paper to match.
            continue

        # The title is read from the SECOND column (index 1).
        paper_title = row[1].strip()

        matched_url = title_to_url.get(paper_title)
        if matched_url is None:
            unmatched_papers.append(paper_title)  # Log the unmatched paper

        if has_url_column:
            # Pad short rows so the URL cell exists, then fill it in place.
            # An existing value is kept when no match was found.
            row.extend([""] * (url_column_index + 1 - len(row)))
            if matched_url is not None:
                row[url_column_index] = matched_url
        else:
            # New column: append the URL, or an empty cell if not found.
            row.append(matched_url or "")
        updated_rows.append(row)

In [None]:
# Step 3: dump the header followed by every updated row into the output CSV
with open(output_csv_file_path, 'w', encoding='utf-8', newline='') as out_handle:
    csv.writer(out_handle).writerows([header, *updated_rows])

# Step 4: record every title without a URL match, one per line under a heading
log_lines = ["Unmatched Papers:\n"]
log_lines.extend(f"{title}\n" for title in unmatched_papers)
with open(log_file_path, 'w', encoding='utf-8') as log_handle:
    log_handle.writelines(log_lines)

print(f"Processing complete. Updated CSV saved to {output_csv_file_path}, unmatched papers logged to {log_file_path}.")

In [None]:
# Step 5: Assign Reviewers
# Reads the URL-augmented CSV, gives each reviewer one contiguous block of
# `papers_per_reviewer` papers (in file order), and writes:
#   * output_csv_file      - all papers plus a "Reviewer" column
#                            (empty for papers beyond the reviewers' capacity)
#   * unassigned_csv_file  - only the papers that received no reviewer
import csv

# List of reviewers
reviewers = [] #Names removed for anonymity 

# File paths
input_csv_file = "updated_merged.csv"  # Input CSV file with 1719 papers
output_csv_file = "updated_merged_reviewers.csv"  # Output CSV file with assignments
unassigned_csv_file = "unassigned_papers.csv"  # Output CSV file for unassigned papers

# Read the list of papers
with open(input_csv_file, 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    header = next(csv_reader)
    # csv.reader yields [] for blank lines; those are not papers and must
    # not consume a reviewer slot.
    papers = [row for row in csv_reader if row]

# Assign papers to reviewers
reviewer_assignments = {reviewer: [] for reviewer in reviewers}
unassigned_papers = []

# Total papers per reviewer
papers_per_reviewer = 100
total_reviewers = len(reviewers)
total_papers = len(papers)

# Assign papers to reviewers, ensuring all papers are accounted for
for idx, paper in enumerate(papers):
    if idx < total_reviewers * papers_per_reviewer:
        # Contiguous chunks (not round-robin): papers 0-99 go to the first
        # reviewer, 100-199 to the second, and so on.
        reviewer = reviewers[idx // papers_per_reviewer]
        reviewer_assignments[reviewer].append(paper + [reviewer])  # Add reviewer to the paper
    else:
        unassigned_papers.append(paper)  # Papers left unassigned

# Write assignments to the output CSV file, including all papers
with open(output_csv_file, 'w', encoding='utf-8', newline='') as output_csv:
    csv_writer = csv.writer(output_csv)
    csv_writer.writerow(header + ["Reviewer"])  # Add reviewer column to the header
    # Iterate over the values under a distinct name so the global `papers`
    # list is not rebound to a single reviewer's assignments.
    for assigned_rows in reviewer_assignments.values():
        csv_writer.writerows(assigned_rows)

    # Add the unassigned papers to the output CSV (without reviewers)
    for paper in unassigned_papers:
        csv_writer.writerow(paper + [''])  # Add empty reviewer field for unassigned papers

# Write unassigned papers to a separate CSV file (optional)
with open(unassigned_csv_file, 'w', encoding='utf-8', newline='') as unassigned_csv:
    csv_writer = csv.writer(unassigned_csv)
    csv_writer.writerow(header)  # Use the original header
    csv_writer.writerows(unassigned_papers)

print(f"Assignments complete. Papers with reviewers saved to '{output_csv_file}', and unassigned papers saved to '{unassigned_csv_file}'.")


Assignments complete. Papers with reviewers saved to 'updated_merged_reviewers.csv', and unassigned papers saved to 'unassigned_papers.csv'.


Extra Step:

An additional 71 papers were collected in March 2025. These papers were accepted in 2024 but had not yet been published when the data was first collected. 
All of the collected papers cite the OpenML-2014 paper. 

In [None]:
"""
Addition of the Last 71 papers, which are extracted in March 2025 
Contains papers accepted in 2024, but published afterwards in early 2025.
"""

import csv
import os

collected_data_csv = "../data/collected_papers.csv"

# Read existing CSV and find max Paper ID
existing_titles = set()  # whitespace-stripped titles already in the CSV (used for de-duplication)
updated_rows = []        # header + all rows that will be written back

if os.path.exists(collected_data_csv):
    # newline='' is what the csv module expects for files handed to csv.reader.
    with open(collected_data_csv, 'r', encoding='utf-8', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        header = next(csv_reader)
        updated_rows.append(header)

        paper_id_index = header.index("Paper ID")
        title_index = header.index("Title")
        max_paper_id = 0

        for row in csv_reader:
            if not row:
                # csv.reader yields [] for a blank line; nothing to keep or index.
                continue
            updated_rows.append(row)
            existing_titles.add(row[title_index].strip())
            max_paper_id = max(max_paper_id, int(row[paper_id_index]))  # Track max Paper ID

else:
    print(f"{collected_data_csv} not found. Creating a new file.")
    header = ["Status", "Paper ID", "Title", "openml-suites-2021", "openml-python-2021",
              "openml-2014", "openml-r-2017", "paper URL"]
    updated_rows.append(header)
    max_paper_id = 0

print(max_paper_id)
# Add missing papers from title_to_url
new_paper_id = max_paper_id + 1
added_count = 0
for title, url in title_to_url.items():
    # Existing titles are stored stripped, so compare (and store) the stripped
    # form; otherwise stray whitespace would defeat the duplicate check.
    clean_title = title.strip()
    if clean_title in existing_titles:
        continue
    # Cell order follows the fallback header above; only "openml-2014" is TRUE.
    # NOTE(review): this assumes an existing file uses the same column order - confirm.
    updated_rows.append(["FALSE", new_paper_id, clean_title, "FALSE", "FALSE", "TRUE", "FALSE", url])
    existing_titles.add(clean_title)  # guards against two keys that differ only by whitespace
    new_paper_id += 1
    added_count += 1

# Make sure the target directory exists so the "creating a new file" path
# does not fail with FileNotFoundError.
output_dir = os.path.dirname(collected_data_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the updated data back to collected_data.csv
with open(collected_data_csv, 'w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file).writerows(updated_rows)

print(f"Updated {collected_data_csv} successfully. Added {added_count} new papers.")


1720
Updated ../data/collected_papers.csv successfully. Added 67 new papers.
