In [3]:
! pip install pandas openml

/bin/bash: line 1: pip: command not found


In [1]:
import os
import pandas as pd
import openml

ModuleNotFoundError: No module named 'openml'

In [None]:
# Input and output CSVs share one directory; paths are relative ("./...").
_THESIS_DIR = "./master_thesis-main/master_thesis-main"

# Task list that is read with pd.read_csv.
CSV_FILE = _THESIS_DIR + "/openml_suite_tasks.csv"
# Destination of the tasks-to-rerun report written with DataFrame.to_csv.
OUTPUT_FILE = _THESIS_DIR + "/tasks_to_rerun.csv"

In [None]:
def _load_tasks(csv_file):
    """Read the task list CSV.

    Prints an error and returns None when the file does not exist, so the
    caller can stop without a bare `return` (which is a SyntaxError at cell
    top level).
    """
    print(f"--- Loading tasks from {csv_file} ---")
    try:
        return pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"[ERROR] Could not find {csv_file}. Make sure it is in the current folder.")
        return None


def _fetch_metadata(dataset_ids):
    """Fetch OpenML metadata for `dataset_ids` in a single listing call.

    Returns the listing as a DataFrame, or None (after printing the error)
    when the request fails. The report code looks datasets up through the
    frame's index, i.e. it relies on the listing being indexed by dataset ID.
    """
    print(f"Fetching metadata for {len(dataset_ids)} unique datasets from OpenML...")
    try:
        # One batched request for all IDs instead of one request per dataset.
        return openml.datasets.list_datasets(
            data_id=dataset_ids,
            output_format='dataframe'
        )
    except Exception as e:
        print(f"[ERROR] Failed to contact OpenML: {e}")
        print("Ensure you are running this on a node with internet access (e.g., login node).")
        return None


def _as_count(value):
    """Return a metadata statistic as an int, or None when it is missing (NaN)."""
    return None if pd.isna(value) else int(value)


def _find_oversized_tasks(tasks, metadata, limit_samples, limit_features):
    """Print the report table and collect the tasks whose dataset exceeds a limit.

    Args:
        tasks: DataFrame with 'Suite ID', 'Task ID', 'Dataset ID' and
            'Dataset Name' columns.
        metadata: DataFrame looked up by dataset ID through its index, with
            'NumberOfInstances' and 'NumberOfFeatures' columns.
        limit_samples: a dataset is flagged when its instance count is
            strictly greater than this value.
        limit_features: a dataset is flagged when its feature count is
            strictly greater than this value.

    Returns:
        A list of dicts, one per flagged task: the original row plus the
        'Actual_Samples', 'Actual_Features' and 'Reason' entries.
    """
    rule = "=" * 100
    print("\n" + rule)
    print(f"{'SUITE':<6} | {'TASK':<8} | {'DATASET ID':<10} | {'NAME':<30} | {'SAMPLES':<8} | {'FEATS':<6} | {'REASON'}")
    print(rule)

    flagged = []
    for _, row in tasks.iterrows():
        did = row['Dataset ID']
        suite_id = row['Suite ID']
        task_id = row['Task ID']
        # str() guards against a missing (NaN) name, which cannot be sliced.
        name = str(row['Dataset Name'])[:30]

        if did not in metadata.index:
            print(f"{suite_id:<6} | {task_id:<8} | {did:<10} | {name:<30} | {'?':<8} | {'?':<6} | [WARN] Metadata not found")
            continue

        # A statistic can be missing (NaN); int(NaN) would raise, so keep it
        # as None and never compare it against a limit.
        n_samples = _as_count(metadata.loc[did, 'NumberOfInstances'])
        n_features = _as_count(metadata.loc[did, 'NumberOfFeatures'])

        reasons = []
        if n_samples is not None and n_samples > limit_samples:
            reasons.append(f"Samples ({n_samples})")
        if n_features is not None and n_features > limit_features:
            reasons.append(f"Feats ({n_features})")

        # Only tasks that exceed at least one threshold are reported.
        if not reasons:
            continue

        reason_str = ", ".join(reasons)
        samples_txt = '?' if n_samples is None else str(n_samples)
        features_txt = '?' if n_features is None else str(n_features)
        print(f"{suite_id:<6} | {task_id:<8} | {did:<10} | {name:<30} | {samples_txt:<8} | {features_txt:<6} | {reason_str}")

        # Keep the full input row plus the statistics for the output CSV.
        row_data = row.to_dict()
        row_data['Actual_Samples'] = n_samples
        row_data['Actual_Features'] = n_features
        row_data['Reason'] = reason_str
        flagged.append(row_data)

    print(rule)
    return flagged


# LIMIT_SAMPLES and LIMIT_FEATURES must be defined in an earlier (configuration)
# cell. They are read first so that a missing definition fails immediately,
# before any file or network access.
limit_samples, limit_features = LIMIT_SAMPLES, LIMIT_FEATURES

rerun_list = []
metadata = None

df = _load_tasks(CSV_FILE)
if df is not None:
    # Unique dataset IDs, so the metadata is fetched once per dataset.
    unique_dids = df['Dataset ID'].unique().tolist()
    metadata = _fetch_metadata(unique_dids)

# Each stage only runs when the previous one succeeded; the failing stage has
# already printed its error message.
if metadata is not None:
    rerun_list = _find_oversized_tasks(df, metadata, limit_samples, limit_features)
    print(f"\n[SUMMARY] Found {len(rerun_list)} tasks that exceed your limits.")

    if rerun_list:
        rerun_df = pd.DataFrame(rerun_list)
        rerun_df.to_csv(OUTPUT_FILE, index=False)
        print(f"[SUCCESS] Saved list of tasks to re-run to: {OUTPUT_FILE}")