# Gather Data Needed for ML4Jets Visualisation

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from urllib.parse import quote
from datetime import datetime, timedelta

In [3]:
def search_cds(params, timeout=30):
    """
    Search CERN Document Server (CDS) with raw query parameters

    Parameters:
    -----------
    params : dict
        Raw query parameters forwarded to the CDS search endpoint. The dict
        is copied before use, so the caller's object is never modified. When
        no 'of' key is present, 'recjson' is used.
    timeout : float or tuple
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot block forever.

    Returns:
    --------
    list or None
        The decoded JSON array, or None when the request fails, the body
        does not start with '[', or the body is not valid JSON.
    """
    base_url = "https://cds.cern.ch/search"

    # Work on a copy: adding the default output format must not leak into
    # the dict the caller passed in.
    query = dict(params)
    query.setdefault('of', 'recjson')

    try:
        response = requests.get(base_url, params=query, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None

    text = response.text.strip()
    if not text.startswith('['):
        # Anything that is not a JSON array is treated as "no usable result".
        print("Error parsing response: body is not a JSON array")
        return None

    # Several arrays may arrive back to back ("[...][...]"); stitch them
    # into one array before decoding.
    # NOTE(review): this is a plain string replace, so the sequence '}][{'
    # inside a string value would be rewritten as well.
    text = text.replace('}][{', '},{')

    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return None

def _nested_date(record, field):
    """Return record[field]['date'], tolerating a list-valued or missing field."""
    value = record.get(field)
    if isinstance(value, list):
        # Some record fields arrive as a list of dicts; use the first entry.
        value = value[0] if value else None
    if isinstance(value, dict):
        return value.get('date')
    return None


def _parse_record_date(date_str):
    """Parse a record date string into a naive datetime; raise ValueError if unsupported."""
    text = str(date_str).strip()
    if 'T' in text:  # ISO format like '2024-11-01T09:06:20'
        parsed = datetime.fromisoformat(text)
    elif len(text) == 4:  # Just year
        parsed = datetime.strptime(text, '%Y')
    else:
        # 'DD Mon YYYY' first, then the plain ISO date 'YYYY-MM-DD'
        for fmt in ('%d %b %Y', '%Y-%m-%d'):
            try:
                parsed = datetime.strptime(text, fmt)
                break
            except ValueError:
                continue
        else:
            raise ValueError(f"unsupported date format {text!r}")

    if parsed.tzinfo is not None:
        # The cutoff is a naive local datetime; convert aware values to
        # naive local time so the comparison cannot raise TypeError.
        parsed = parsed.astimezone().replace(tzinfo=None)
    return parsed


def filter_recent_records(records, max_age_days=365):
    """
    Filter records to keep only those newer than max_age_days

    The date is taken from the first non-empty value among
    'creation_date', 'imprint' -> 'date' and 'prepublication' -> 'date'.
    Records without a date are dropped; records whose date cannot be
    parsed are dropped after printing a warning.

    Parameters:
    -----------
    records : list or None
        List of CDS records (dicts). None or an empty list yields [].
    max_age_days : int
        Maximum age in days

    Returns:
    --------
    list
        Filtered list of records (the same record objects, in input order)
    """
    if not records:
        return []

    cutoff_date = datetime.now() - timedelta(days=max_age_days)

    filtered_records = []
    for record in records:
        # Try to get creation date from different possible fields
        date_str = (record.get('creation_date') or
                    _nested_date(record, 'imprint') or
                    _nested_date(record, 'prepublication'))
        if not date_str:
            continue

        try:
            record_date = _parse_record_date(date_str)
        except ValueError as e:
            print(f"Warning: Could not parse date {date_str}: {e}")
            continue

        if record_date >= cutoff_date:
            filtered_records.append(record)

    return filtered_records


## ATLAS Data

### Collection

In [4]:
# Query handed to search_cds: the phrase 'machine learning' with
# f1='abstract' and rg=100, limited to the four ATLAS entries listed under 'c'.
params = dict(
    p1='machine learning',
    f1='abstract',
    rg=100,
    c=[
        'ATLAS Conference Notes',
        'ATLAS Conference Slides',
        'ATLAS PUB Notes',
        'ATLAS Preprints',
    ],
)
results = search_cds(params)

In [5]:
# Number of records returned by search_cds (shown as the cell output).
# search_cds returns None on failure, in which case this raises TypeError.
len(results)

126

In [6]:
# Keep only records inside filter_recent_records' default window
# (max_age_days=365), then display how many remain.
recent_results = filter_recent_records(results)
len(recent_results)

32

In [7]:
# Print results: one title/date/abstract entry per record, each followed by a
# 50-dash rule. Nothing is printed when recent_results is empty or None.
for record in recent_results or []:
    # Fall back to a placeholder for any field the record does not carry
    title = record.get('title', {}).get('title', 'No title')
    abstract = record.get('abstract', 'No abstract')
    date = record.get('prepublication', {}).get('date', 'No date')

    print(
        f"\nTitle: {title}",
        f"Date: {date}",
        f"Abstract: {abstract}",
        "-" * 50,
        sep="\n",
    )


Title: An implementation of Neural Simulation-Based Inference for Parameter Estimation in ATLAS
Date: 28 Oct 2024
Abstract: {'summary': 'Neural Simulation-Based Inference (NSBI) is a powerful class of machine learning (ML)-based methods for statistical inference that naturally handles high-dimensional parameter estimation without the need to bin data into low-dimensional summary histograms. Such methods are promising for a range of measurements, including at the Large Hadron Collider (LHC), where no single observable may be optimal to scan over the entire theoretical phase space under consideration, or where binning data into histograms could result in a loss of sensitivity. This work develops an NSBI framework for statistical inference, using neural networks to estimate probability density ratios, which enables the application of NSBI to a full-scale LHC analysis. It incorporates a large number of systematic uncertainties, quantifies the uncertainty coming from finite training statis

In [8]:
# Flatten each record in place before saving: 'title' becomes the title
# string and 'abstract' becomes the summary string.

for record in recent_results:
    print(record["title"])
    try:
        record["title"] = record["title"]["title"]
        # Handle case where abstract is a list
        if isinstance(record["abstract"], list):
            record["abstract"] = record["abstract"][0]
        record["abstract"] = record["abstract"]["summary"]
    except (KeyError, IndexError, TypeError):
        # Only the lookup failures expected here are caught (missing key,
        # empty list, value that is not a dict); anything else, including
        # KeyboardInterrupt, propagates.
        print(f"No abstract for {record['title']}")

{'title': 'An implementation of Neural Simulation-Based Inference for Parameter Estimation in ATLAS'}
{'title': 'Measurement of Track Functions in ATLAS Run 2 Data'}
{'title': 'Recent Advances in the GAN-based Fast Calorimeter Simulation of the ATLAS Experiment'}
{'title': 'Parameter Estimation in ATLAS with Neural Simulation-Based Inference'}
{'title': 'Towards Machine-Learning Particle Flow with the ATLAS Detector at the LHC'}
{'title': 'Improving Computational Performance of ATLAS GNN Track Reconstruction Pipeline'}
{'title': 'ATLAS EFT Results in the Top Quark Sector'}
{'title': 'AtlFast3: Fast Simulation in ATLAS for LHC Run 3 and beyond'}
{'title': 'Boosted Higgs decays to b-quarks in ATLAS'}
{'title': 'Performance versus uncertainty in boosted top tagging with the ATLAS detector'}
{'title': 'Flavour Tagging with Graph Neural Network with the ATLAS Detector'}
{'title': 'ATLAS searches for electroweak supersymmetry with compressed spectra'}
{'title': 'Flavour Tagging with Graph Ne

In [9]:
# Serialise the flattened ATLAS records to JSON and write them out
with open('../data/atlas_data.json', 'w') as f:
    f.write(json.dumps(recent_results))

In [59]:
# Print results again; in this cell 'title' is read directly from the record
# rather than from a nested dict. Nothing is printed when recent_results is
# empty or None.
for record in recent_results or []:
    # Fall back to a placeholder for any field the record does not carry
    title = record.get('title', 'No title')
    abstract = record.get('abstract', 'No abstract')
    date = record.get('prepublication', {}).get('date', 'No date')

    print(
        f"\nTitle: {title}",
        f"Date: {date}",
        f"Abstract: {abstract}",
        "-" * 50,
        sep="\n",
    )


Title: An implementation of Neural Simulation-Based Inference for Parameter Estimation in ATLAS
Date: 28 Oct 2024
Abstract: Neural Simulation-Based Inference (NSBI) is a powerful class of machine learning (ML)-based methods for statistical inference that naturally handles high-dimensional parameter estimation without the need to bin data into low-dimensional summary histograms. Such methods are promising for a range of measurements, including at the Large Hadron Collider (LHC), where no single observable may be optimal to scan over the entire theoretical phase space under consideration, or where binning data into histograms could result in a loss of sensitivity. This work develops an NSBI framework for statistical inference, using neural networks to estimate probability density ratios, which enables the application of NSBI to a full-scale LHC analysis. It incorporates a large number of systematic uncertainties, quantifies the uncertainty coming from finite training statistics, develop

## Combined Search Data

In [30]:
# Same 'machine learning' / f1='abstract' / rg=100 query, this time with 'c'
# listing ATLAS, CMS, LHCb and ALICE.
params = dict(
    p1='machine learning',
    f1='abstract',
    rg=100,
    c=['ATLAS', 'CMS', 'LHCb', 'ALICE'],
)
results = search_cds(params)

In [31]:
# Number of records returned by the combined search (shown as the cell output).
# search_cds returns None on failure, in which case this raises TypeError.
len(results)

271

In [32]:
# Keep only records inside filter_recent_records' default window
# (max_age_days=365), then display how many remain.
recent_results = filter_recent_records(results)
len(recent_results)

129

In [26]:
# Print results: title, date, abstract and experiment for every record, each
# entry followed by a 50-dash rule. Nothing is printed when recent_results is
# empty or None.
for record in recent_results or []:
    # Fall back to a placeholder for any field the record does not carry
    title = record.get('title', {}).get('title', 'No title')
    abstract = record.get('abstract', 'No abstract')
    date = record.get('prepublication', {}).get('date', 'No date')

    # 'accelerator_experiment' may be a single dict or a list of them;
    # for a list, only the first element is used
    experiment = record.get('accelerator_experiment', {})
    if isinstance(experiment, list):
        experiment = experiment[0]
    experiment = experiment.get('experiment', 'No experiment')

    print(
        f"\nTitle: {title}",
        f"Date: {date}",
        f"Abstract: {abstract}",
        f"Experiment: {experiment}",
        "-" * 50,
        sep="\n",
    )


Title: Searches for New Physics with top quarks using the ATLAS detector
Date: 29 Oct 2024
Abstract: {'summary': 'Due to its large mass, the top quark plays a crucial role in probing the Standard Model of particle physics and beyond. The latest results from searches for new physics using top quark events collected with the ATLAS detector at the Large Hadron Collider are presented. In particular, the focus lies on scenarios beyond the Standard Model where the top quark provides is a powerful probe to answer open questions on Dark Matter and the light mass of the Higgs Boson. All searches presented utilize the full Run-2 dataset from the LHC, employing advanced techniques such as machine learning to provide the best sensitivity to new physics.'}
Experiment: ATLAS
--------------------------------------------------

Title: Recent Advances in the GAN-based Fast Calorimeter Simulation of the ATLAS Experiment
Date: 28 Oct 2024
Abstract: {'summary': 'Simulation of the detector response is a m

In [33]:
# Flatten each record in place before saving: 'title' becomes the title
# string, 'abstract' becomes the summary string, and a top-level
# 'experiment' key is added.

for record in recent_results:
    print(record["title"])
    try:
        record["title"] = record["title"]["title"]
        # Handle case where abstract is a list
        if isinstance(record["abstract"], list):
            record["abstract"] = record["abstract"][0]
        record["abstract"] = record["abstract"]["summary"]
    except (KeyError, IndexError, TypeError):
        # Only the lookup failures expected here are caught (missing key,
        # empty list, value that is not a dict).
        print(f"No abstract for {record['title']}")

    # The experiment is resolved outside the try block so that a record
    # without a usable abstract still receives its 'experiment' key.
    experiment_info = record.get('accelerator_experiment', {})
    # If accelerator_experiment is a list, take the first element
    if isinstance(experiment_info, list):
        experiment_info = experiment_info[0] if experiment_info else {}
    if isinstance(experiment_info, dict):
        record["experiment"] = experiment_info.get('experiment', 'No experiment')
    else:
        record["experiment"] = 'No experiment'

{'title': 'Searches for New Physics with top quarks using the ATLAS detector'}
{'title': 'Recent Advances in the GAN-based Fast Calorimeter Simulation of the ATLAS Experiment'}
{'title': 'Parameter Estimation in ATLAS with Neural Simulation-Based Inference'}
{'title': 'Towards Machine-Learning Particle Flow with the ATLAS Detector at the LHC'}
{'title': 'Improving Computational Performance of ATLAS GNN Track Reconstruction Pipeline'}
{'title': 'ATLAS EFT Results in the Top Quark Sector'}
{'title': 'The ATLAS inner detector trigger performance in Run 3'}
{'title': 'An implementation of Neural Simulation-Based Inference for Parameter Estimation in ATLAS'}
{'title': 'Treatment of systematic uncertainties in $b$-jet identification and measurement of Higgs boson decays to $b$-quarks with the ATLAS detector'}
{'title': 'AtlFast3: Fast Simulation in ATLAS for LHC Run 3 and beyond'}
{'title': 'Computational Performance of the ATLAS ITk GNN Track Reconstruction Pipeline'}
{'title': 'Polarising 

In [34]:
# Serialise the flattened multi-experiment records to JSON and write them out
with open('../data/lhc_combined_data.json', 'w') as f:
    f.write(json.dumps(recent_results))