# OpenML Impact Analysis

In [None]:
!pip install -r ../requirements.txt --quiet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

## Data cleaning and preprocessing

In [None]:
# Input files for the analysis.
# path1: original list of collected papers; path2: survey results.
path1, path2 = '../data/collected_papers.csv', "../data/Final_survey_data.csv"

# Only the survey results are loaded in this cell.
df = pd.read_csv(filepath_or_buffer=path2)

print("Total no of reviews recieved: ", df.shape[0])
df.head()

In [None]:
# Show the survey's column names as a one-column table for easy reference.
pd.DataFrame(df.columns)

### Sanity Check:
1. Find and remove duplicates (matched using the Paper ID column).
2. Match paper ID and paper title against the original list.
3. Remove papers with a wrong year (< 2014).
4. Exclude papers from the year 2025.
5. Remove empty rows.

In [None]:
# Original list of scraped papers, read from path1.
Total_papers = pd.read_csv(filepath_or_buffer=path1)
print("Total number of scraped papers: ", Total_papers.shape[0])

In [None]:
# Scraped papers without a review row in the survey data.
# These are treated as papers that were not available or behind a paywall.
unvailable_papers = Total_papers.shape[0] - df.shape[0]
print(unvailable_papers)

In [None]:
# Lookup table from Paper ID to Title, built from the original list.
assigned_papers_dict = Total_papers.set_index("Paper ID")["Title"].to_dict()
assigned_papers_dict

In [None]:
# Take the per-paper citation columns from the original list and rename the key
# so that it matches the survey's Paper ID column.
# rename() returns a new frame; the previous in-place rename on a column subset
# raised SettingWithCopyWarning and depended on the subset being a copy.
sheet1 = Total_papers[
    ["Paper ID", "openml-suites-2021", "openml-python-2021", "openml-2014", "openml-r-2017"]
].rename(columns={"Paper ID": "Paper ID (from shared sheet)"})

# Left join: every review row is kept, including rows whose Paper ID is absent
# from the original list.
# NOTE: re-running this cell merges the columns a second time (pandas then adds
# _x/_y suffixes); reload df before re-running.
df = df.merge(sheet1, on="Paper ID (from shared sheet)", how="left")
df.head()

In [None]:
# Step 1: Remove duplicates

# All reviews whose Paper ID occurs more than once (every occurrence is listed).
duplicate_rows = df.loc[df.duplicated(subset="Paper ID (from shared sheet)", keep=False)]

print("No. of duplicates:", duplicate_rows.shape[0])
duplicate_rows

In [None]:
# Every occurrence of a repeated Paper ID.
duplicates = df.loc[df.duplicated(subset="Paper ID (from shared sheet)", keep=False)]
# The rows that get dropped below: every repeat after the first occurrence.
duplicates_removed = df.loc[df.duplicated(subset="Paper ID (from shared sheet)", keep="first")]

# Keep only the first review for each Paper ID.
df = df.drop_duplicates(subset="Paper ID (from shared sheet)", keep="first")

print("No of duplicates removed: ", duplicates_removed.shape[0])
duplicates_removed

In [None]:
# Row count once repeated Paper IDs have been dropped.
print("No of reviews after duplicate removal: ", df.shape[0])

In [None]:
# Step 2: Report mismatches
# Collect every review whose title does not match the title registered for its
# Paper ID in the original list. This cell only reports; it removes nothing.

mismatches = []

for _, row in df.iterrows():
    paper_id = row["Paper ID (from shared sheet)"]
    given_title = row.get("Paper Title")
    expected_title = assigned_papers_dict.get(paper_id)

    # Missing values are NaN/None rather than text. Report them with an explicit
    # reason instead of catching the AttributeError raised by .strip().
    if not isinstance(expected_title, str):
        mismatches.append({
            "Paper ID": paper_id,
            "Error": "Paper ID not found in original list (or title missing there)",
            "Given Title": "Not Found" if given_title is None else given_title,
        })
        continue
    if not isinstance(given_title, str):
        mismatches.append({
            "Paper ID": paper_id,
            "Error": "Paper title missing in survey row",
            "Given Title": "Not Found" if given_title is None else given_title,
        })
        continue

    normalized_given = given_title.strip().lower()
    normalized_expected = expected_title.strip().lower()

    # The survey title only has to be contained in the expected title.
    # An empty string is contained in every string, so a blank title is
    # rejected explicitly; otherwise it would count as a match.
    if not normalized_given or normalized_given not in normalized_expected:
        mismatches.append({
            "Paper ID": paper_id,
            "Given Title": given_title,
            "Expected Title": expected_title,
        })

mismatches_df = pd.DataFrame(mismatches)
mismatches_df



In [None]:
# No Mismatches between the paper title and paper ID

In [None]:
# Row count after the title/ID consistency check.
print("No of reviews after mismatch entry removal: ", df.shape[0])

In [None]:
# Step 5: Drop rows in which every column is NaN or NaT
print(df.shape[0])
df = df.dropna(axis=0, how="all")

print("no of reviews after empty row removal:", df.shape[0])

In [None]:
# Step 3 and 4 (preparation): turn "Paper Year" into an integer column.
# A year before 2014 is an incorrect entry; years after 2024 are not considered
# in the survey. This cell only drops rows without a usable year.

# Row count before any year-based removal, used for the bookkeeping below.
reviews_before_year_cleaning = len(df)

# Work on an explicit copy so the column assignments below do not raise
# SettingWithCopyWarning on a frame produced by earlier filtering.
df = df.copy()

# Convert to numeric; entries that cannot be parsed become NaN.
df["Paper Year"] = pd.to_numeric(df["Paper Year"], errors='coerce')

# Drop rows with NaN in 'Paper Year'
df = df[df["Paper Year"].notna()].copy()

# Convert to integer (fractional years, if any, are truncated by astype).
df["Paper Year"] = df["Paper Year"].astype(int)
print(len(df))

# Reviews with a missing or non-numeric year are counted as unavailable
# (or behind a paywall). The number of dropped rows is measured here rather
# than taken from the previously hard-coded constant 1770.
# NOTE(review): confirm that 1770 was the row count at this point of the pipeline.
# NOTE: re-running this cell adds to unvailable_papers again.
unvailable_papers = unvailable_papers + (reviews_before_year_cleaning - len(df))

print("unvailable_papers:", unvailable_papers)

In [None]:
# Report and remove entries whose year lies outside the 2014-2024 window.

out_of_range_2014 = df[df["Paper Year"] < 2014]
print("Papers with year before 2014:", len(out_of_range_2014))

# The filter is "> 2024", so the message says 2024 (it previously said 2025).
out_of_range_2024 = df[df["Paper Year"] > 2024]
print("Papers with year after 2024:", len(out_of_range_2024))

no_of_papers_before_out_of_year_removal = len(df)

# Keep 2014 through 2024 inclusive. The copy makes later column assignments on
# df operate on an independent frame.
df = df[(df["Paper Year"] >= 2014) & (df["Paper Year"] <= 2024)].copy()
no_of_papers_after_out_of_year_removal = len(df)

print("No of papers after removal of papers (before 2014 and after 2024):", len(df))

In [None]:
# Difference between the row counts recorded before and after the year filter.
print(
    "Papers removed (wrong year or after 2024): ",
    no_of_papers_before_out_of_year_removal - no_of_papers_after_out_of_year_removal,
)

In [None]:
# Distinct raw answers in the "Paper Available" column (inspect before normalising).
df["Paper Available"].unique()

In [None]:
# Normalise the answers to lowercase text so "Yes"/"yes" count as the same value.
df["Paper Available"] = df["Paper Available"].astype(str).str.lower()

# Number of "yes" answers, followed by the number of all other answers.
count_yes = df["Paper Available"].value_counts().get("yes", 0)
print(count_yes)
print(len(df) - count_yes)

# Everything that is not "yes" is added to the running unavailable total.
unvailable_papers += len(df) - count_yes
print("Total no of unvailable_papers:", unvailable_papers)


In [None]:
# Normalise the language answers to lowercase text before counting.
df["Paper in English"] = df["Paper in English"].astype(str).str.lower()

# Number of "yes" answers, followed by the number of all other answers.
count_yes = df["Paper in English"].value_counts().get("yes", 0)
print(count_yes)
print(len(df) - count_yes)

In [None]:
# Number of papers that are neither (completely) available nor in English,
# i.e. rows where neither of the two answers is "yes".
overlap = df[~((df["Paper Available"] == "yes") | (df["Paper in English"] == "yes"))]
print(overlap.shape[0])

In [None]:
# Rows excluded from the final analysis set: at least one of the two answers
# ("Paper Available", "Paper in English") is not "yes".
not_in_final_df = df[(df["Paper Available"] != "yes") | (df["Paper in English"] != "yes")]

print(not_in_final_df.shape[0])
# Show the two deciding columns for the excluded rows
not_in_final_df[["Paper Available", "Paper in English"]]


In [None]:
# The analysis set: papers that are both available and written in English.
# Copied so that later column additions do not touch df.
available_papers = df.loc[
    (df["Paper Available"] == "yes") & (df["Paper in English"] == "yes")
].copy()
print("Final number of papers for analysis: ", available_papers.shape[0])

In [None]:
# Lowercase every text column so that answers such as "Yes" and "yes" compare equal.

# Identify text columns (object or pandas string dtype)
str_cols = available_papers.select_dtypes(include=["object", "string"]).columns

for col in str_cols:
    original_values = available_papers[col]
    # astype(str) alone turns a missing answer into the literal text "nan",
    # after which isna()/dropna() no longer recognise it as missing.
    # Re-mask with the original notna() so missing answers stay missing.
    available_papers[col] = (
        original_values.astype(str).str.lower().where(original_values.notna())
    )

available_papers.head()


Final statistics

1. Total extracted papers: 1786
2. Papers used in analysis: 1528


## Analysis

In [None]:

# Citation columns merged in from the original paper list.
columns_to_analyze = [
    "openml-suites-2021",
    "openml-python-2021",
    "openml-2014",
    "openml-r-2017",
]

# Mean of each column, scaled to a percentage.
percentages = available_papers[columns_to_analyze].mean() * 100

# Report one line per column
for column in percentages.index:
    percentage = percentages[column]
    print(f"Percentage of papers with True for {column}: {percentage:.2f}%")


In [None]:

# Number of analysed papers for each publication year, ordered by year.
papers_per_year = available_papers["Paper Year"].value_counts().sort_index()

plt.figure(figsize=(10, 5))
plt.bar(
    papers_per_year.index,
    papers_per_year.values,
    color="skyblue",
    edgecolor="black",
)

# Titles, axis labels and a faint dashed grid.
plt.title("Number of OpenML Cited Papers Per Year")
plt.xlabel("Year")
plt.ylabel("Number of Papers")
plt.xticks(rotation=45)
plt.grid(linestyle='--', alpha=0.2)

plt.show()


In [None]:
# Which share of the papers only cites OpenML without actively interacting with it?

# The six survey questions that describe an interaction with OpenML.
interaction_columns = [
    "Does the paper use datasets from OpenML?",
    "Does the paper use a collection (at least 2 or more) of datasets that are defined by OpenML designated to do benchmarking (e.g., openml benchmarking suites)?",
    "Does the paper use OpenML experiment data (i.e., utilise results from runs)? ",
    "Does the paper upload datasets to OpenML?",
    "Does the paper upload experiment data to OpenML?",
    "Does the paper interact in any other way with OpenML?"
]

# A paper counts as citing-only when all six answers are exactly "no".
just_citing_papers = available_papers.loc[
    available_papers[interaction_columns].eq("no").all(axis=1)
]

percentage_citing_only = (len(just_citing_papers) / len(available_papers)) * 100
print(len(just_citing_papers), round(percentage_citing_only, 2))

# Citing-only papers that have a current OpenML core member as co-author.
core_citing_papers = just_citing_papers.loc[
    just_citing_papers["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
]
print(len(core_citing_papers), round((len(core_citing_papers)/len(just_citing_papers))*100,2))


In [None]:
# Among the citing-only papers: how many cite the OpenML-2014 paper, and what share is that?

just_citing_openml_2014 = just_citing_papers.loc[just_citing_papers["openml-2014"] == True]
print(
    len(just_citing_openml_2014),
    (len(just_citing_openml_2014) / len(just_citing_papers)) * 100,
)


In [None]:
# Answer frequencies for the two pie charts.
# The language chart uses df (before the availability/English filter),
# the core-member chart uses the analysis set.
english_counts = df["Paper in English"].value_counts()
core_member_counts = available_papers[
    "Does the paper have at least 1 current OpenML Core Member as co-author?"
].value_counts()

plt.figure(figsize=(18, 4))

# Left panel: language of the paper
plt.subplot(1, 2, 1)
plt.pie(english_counts, labels=english_counts.index, autopct="%1.1f%%")
plt.title("Papers in English")

# Right panel: core-member co-authorship
plt.subplot(1, 2, 2)
plt.pie(core_member_counts, labels=core_member_counts.index, autopct="%1.1f%%")
plt.title("Papers with OpenML Core Member Co-author")

plt.tight_layout()
plt.show()

In [None]:
# Datasets: papers that answered "yes" to using datasets from OpenML.

dataset_papers = available_papers.loc[
    available_papers["Does the paper use datasets from OpenML?"] == "yes"
]
dataset_percentage = (len(dataset_papers) / len(available_papers)) * 100
print(dataset_percentage, len(dataset_papers))
print(
    "Number of paper by core-authors: ",
    len(dataset_papers.loc[
        dataset_papers["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
    ]),
)

# Rows that carry an explanation in the "unclear" column, shown with the answer.
unclear_dataset_papers = available_papers.dropna(
    subset=["If unclear, one sentence explanation?"]
)[["Does the paper use datasets from OpenML?", "If unclear, one sentence explanation?"]]
unclear_dataset_papers

In [None]:
# Benchmark: papers that answered "yes" to using an OpenML-defined benchmarking collection.
benchmark_papers = available_papers.loc[
    available_papers["Does the paper use a collection (at least 2 or more) of datasets that are defined by OpenML designated to do benchmarking (e.g., openml benchmarking suites)?"] == "yes"
]

# The share is computed relative to papers from 2017 onwards.
# NOTE(review): the numerator is not restricted to >= 2017 while the
# denominator is; confirm this asymmetry is intended.
papers_since_2017 = available_papers.loc[available_papers["Paper Year"] >= 2017]
benchmark_percentage = (len(benchmark_papers) / len(papers_since_2017)) * 100
print(benchmark_percentage, len(benchmark_papers))

benchmark_datasets = benchmark_papers["If yes, which benchmark suites?"]

print(
    "Number of paper by core-authors: ",
    len(benchmark_papers.loc[
        benchmark_papers["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
    ]),
)

# Distinct suite descriptions after lowercasing and trimming whitespace.
unique_benchmark_datasets = benchmark_datasets.str.lower().str.strip().unique()

unique_benchmark_datasets

In [None]:
# Experiment data (runs): papers that answered "yes" to using OpenML experiment data.

experiment_papers = available_papers.loc[
    available_papers["Does the paper use OpenML experiment data (i.e., utilise results from runs)? "] == "yes"
]
experiment_percentage = (len(experiment_papers) / len(available_papers)) * 100

print(len(experiment_papers), experiment_percentage)
print(
    "Number of paper by core-authors: ",
    len(experiment_papers.loc[
        experiment_papers["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
    ]),
)

# Show the reviewers' explanation next to each paper ID.
experiment_papers[[
    "Paper ID (from shared sheet)",
    "if yes: short (e.g., 1 sentence) explanation: how does it use this data?",
]]


In [None]:
# Papers that answered "yes" to uploading datasets to OpenML.
upload_datasets = available_papers.loc[
    available_papers["Does the paper upload datasets to OpenML?"] == "yes"
]

upload_datasets_percentage = len(upload_datasets) / len(available_papers) * 100

print(len(upload_datasets), upload_datasets_percentage)
print(
    "Number of paper by core-authors: ",
    len(upload_datasets.loc[
        upload_datasets["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
    ]),
)

# Show which dataset was reported for each of these papers.
upload_datasets[[
    "Paper ID (from shared sheet)",
    "Does the paper upload datasets to OpenML?",
    "If yes, which dataset?",
]]

In [None]:
# Papers that answered "yes" to uploading experiment data to OpenML.
upload_experiment_data_papers = available_papers.loc[
    available_papers["Does the paper upload experiment data to OpenML?"] == "yes"
]

upload_experiment_data_percentage = len(upload_experiment_data_papers) / len(available_papers) * 100

print(len(upload_experiment_data_papers), upload_experiment_data_percentage)
print(
    "Number of paper by core-authors: ",
    len(upload_experiment_data_papers.loc[
        upload_experiment_data_papers["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
    ]),
)

# Show the reported experiment type next to each paper ID.
upload_experiment_data_papers[[
    "Paper ID (from shared sheet)",
    "if yes: short (e.g., 1 sentence) explanation: what type of experiments?",
]]

In [None]:
# Rows of the analysis set whose "Paper Year" is missing (shown for inspection).
available_papers[available_papers["Paper Year"].isna()]

In [None]:
# Boolean helper columns: True where the corresponding survey answer is "yes".
available_papers["Datasets Used"] = available_papers[
    "Does the paper use datasets from OpenML?"
].eq("yes")
available_papers["Benchmark Used"] = available_papers[
    "Does the paper use a collection (at least 2 or more) of datasets that are defined by OpenML designated to do benchmarking (e.g., openml benchmarking suites)?"
].eq("yes")

# Yearly counts of True values, used for the line plot.
datasets_by_year = available_papers.groupby("Paper Year")["Datasets Used"].sum()
benchmarks_by_year = available_papers.groupby("Paper Year")["Benchmark Used"].sum()

# Leave out the year 2025
datasets_by_year_filtered = datasets_by_year.loc[datasets_by_year.index != 2025]
benchmarks_by_year_filtered = benchmarks_by_year.loc[benchmarks_by_year.index != 2025]

# Font sizes shared by the plotting code
LABEL_SIZE = 14
TITLE_SIZE = 20
NUMBER_SIZE = 14

def add_data_labels(ax, years, values, color):
    """Write each value as a text label next to its (year, value) point.

    Points from 2023 onwards are labelled below and slightly to the right of
    the point (top-aligned, shifted by x=+0.3, y=-7); earlier points are
    labelled directly at the point, bottom-aligned. All labels are
    right-aligned and use the module-level NUMBER_SIZE as font size.
    """
    for point_year, count in zip(years, values):
        is_late = point_year >= 2023
        dx, dy = (0.3, -7) if is_late else (0, 0)
        ax.text(
            point_year + dx,
            count + dy,
            str(count),
            color=color,
            fontsize=NUMBER_SIZE,
            ha="right",
            va='top' if is_late else 'bottom',
        )

# Create the figure; pad the y range by 5% of the largest yearly count on both ends.
fig, ax = plt.subplots(figsize=(12, 6))
y_max = max(datasets_by_year_filtered.max(), benchmarks_by_year_filtered.max())
padding_factor = 1.05
ax.set_ylim(-(y_max * padding_factor - y_max), y_max * padding_factor)

# Milestone annotations: year -> label text and the height (in data units)
# at which the marker line ends and the label starts.
annotations = {
    2016: {"text": "OpenML R\nPackage on\nCRAN", "y_pos": 50},
    2017: {"text": "Preprint of OpenML\nBenchmarking Suites\nPaper, OpenML R Paper", "y_pos": 100},
    2018: {"text": "openml-python\non PyPI", "y_pos": 150},
    2019: {"text": "First AutoML\nBenchmark Paper", "y_pos": 175},
    2021: {"text": "NeurIPS Paper on OpenML\nBenchmarking Suites, OpenML\nPython Paper", "y_pos": 50},
    2023: {"text": "mlr3oml R Package\non CRAN", "y_pos": 100}
}

for year, anno_dict in annotations.items():
    y_pos = anno_dict["y_pos"]
    # Vertical marker from the bottom of the axes up to y_pos, capped by a short
    # horizontal tick, with the label placed just above the cap.
    ax.vlines(x=year, ymin=ax.get_ylim()[0], ymax=y_pos, colors='black', alpha=.3)
    ax.hlines(y=y_pos, xmin=year-0.1, xmax=year+0.1, colors='black', alpha=.3)
    ax.text(year, y_pos + 5, anno_dict["text"],
            ha='center', va='bottom', fontsize=12,
            color='black', alpha=.6)

# Plot both series and label every point with its value
for data, color, label in [
    (datasets_by_year_filtered, 'blue', "Papers Using OpenML Datasets"),
    (benchmarks_by_year_filtered, 'red', "Papers Using OpenML Benchmarking Suites")
]:
    ax.plot(data.index, data.values, color=color, marker="o", label=label)
    add_data_labels(ax, data.index, data.values, color)

# Style the axes.
# One tick per plotted year. range() excludes its stop value, so +1 is needed
# to include the last year (it previously had no tick or label).
tick_years = list(range(int(datasets_by_year_filtered.index.min()),
                        int(datasets_by_year_filtered.index.max()) + 1))
ax.set_xticks(tick_years)
ax.set_xticklabels(tick_years, rotation=-45)
ax.tick_params(axis='both', which='major', labelsize=LABEL_SIZE, width=1.5)
for spine in ax.spines.values():
    spine.set_linewidth(1.5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Add labels and legend
ax.set_xlabel("Year", fontsize=18)
ax.set_ylabel("Number of Papers", fontsize=18)
ax.set_title("Datasets and Benchmarks Used by Papers per Year", fontsize=TITLE_SIZE)
ax.legend(fontsize=14)

plt.tight_layout()
plt.show()

# Save the figure as a single-page PDF in the current working directory
with PdfPages("Datasets_and_Benchmark_per_year.pdf") as fh:
    fh.savefig(fig, bbox_inches="tight")

In [None]:
# Papers that answered "yes" to interacting with OpenML in any other way.
other_interactions = available_papers.loc[
    available_papers["Does the paper interact in any other way with OpenML?"] == "yes"
]

other_interactions_percentage = len(other_interactions) / len(available_papers) * 100

print(len(other_interactions), other_interactions_percentage)
print(
    "Number of paper by core-authors: ",
    len(other_interactions.loc[
        other_interactions["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
    ]),
)

# Show the reviewers' explanation next to the answer.
other_interactions[[
    "Does the paper interact in any other way with OpenML?",
    "if yes: short (e.g., 1 sentence) explanation how?",
]]

In [None]:
# Papers that reviewers starred for a special mention.
starred_papers = available_papers.loc[
    available_papers["Star it as some cool project to be showcased in our paper?"] == "yes"
]

print(starred_papers.shape[0])

# Show the motivation given for each starred paper.
starred_papers[["Paper ID (from shared sheet)", "If yes, please motivate your answer"]]


In [None]:
# output_file = "starred_papers.csv"
# starred_papers.to_csv(output_file, index=False)

In [None]:
# Papers marked as a thesis by the reviewer.
thesis_papers = available_papers.loc[
    available_papers["Is the paper a thesis (Bachelor's, Master's, or PhD)?"] == "yes"
]

# Papers whose optional description mentions "thesis" (case-insensitive;
# missing descriptions count as no match).
mentions_thesis = available_papers["Optional short description"].str.contains(
    "thesis", case=False, na=False
)
thesis_keyword_papers = available_papers.loc[mentions_thesis]

# Union of both selections; rows picked up by both criteria are kept once.
all_thesis_papers = pd.concat([thesis_papers, thesis_keyword_papers]).drop_duplicates()

thesis_percentage = len(all_thesis_papers) / len(available_papers) * 100
print(
    "Number of thesis supervised by core-authors: ",
    len(all_thesis_papers.loc[
        all_thesis_papers["Does the paper have at least 1 current OpenML Core Member as co-author?"] == "yes"
    ]),
)

print(len(all_thesis_papers), thesis_percentage)
