In [1]:
%pip install pdfminer.six
import requests
import pandas as pd
import time
import os
import re
import tempfile
from pdfminer.high_level import extract_text

Collecting pdfminer.six
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250327


In [2]:
base_url = "https://paperswithcode.com/api/v1/papers/"
items_per_page = 500
total_items = 20000
num_pages = total_items // items_per_page  # This will be 20 pages

all_results = []

for page in range(1, num_pages + 1):
    url = f"{base_url}?page={page}&items_per_page={items_per_page}&q=machine%20learning"
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        data = response.json()       # Parse JSON response

        # Extend our list if the 'results' key is available
        if "results" in data:
            results = data["results"]
            all_results.extend(results)
            print(f"Page {page} processed, retrieved {len(results)} items.")
        else:
            print(f"Page {page} returned no 'results' key.")
    except requests.exceptions.RequestException as e:
        print(f"An error occurred on page {page}: {e}")

print(f"Total items collected: {len(all_results)}")


Page 1 processed, retrieved 500 items.
Page 2 processed, retrieved 500 items.
Page 3 processed, retrieved 500 items.
Page 4 processed, retrieved 500 items.
Page 5 processed, retrieved 500 items.
Page 6 processed, retrieved 500 items.
Page 7 processed, retrieved 500 items.
Page 8 processed, retrieved 500 items.
Page 9 processed, retrieved 500 items.
Page 10 processed, retrieved 500 items.
Page 11 processed, retrieved 500 items.
Page 12 processed, retrieved 500 items.
Page 13 processed, retrieved 500 items.
Page 14 processed, retrieved 500 items.
Page 15 processed, retrieved 500 items.
Page 16 processed, retrieved 500 items.
Page 17 processed, retrieved 500 items.
Page 18 processed, retrieved 500 items.
Page 19 processed, retrieved 500 items.
Page 20 processed, retrieved 500 items.
Page 21 processed, retrieved 500 items.
Page 22 processed, retrieved 500 items.
Page 23 processed, retrieved 500 items.
Page 24 processed, retrieved 500 items.
Page 25 processed, retrieved 500 items.
Page 26 p

In [3]:
df = pd.DataFrame(all_results)
df = df[["id", "title", "abstract"]]
df.to_csv("paper_abstracts.csv", index=False)

# Display some information about the DataFrame
print(f"\nTotal items collected: {len(df)}")
print("\nData preview:")
df.head()


Total items collected: 20000

Data preview:


Unnamed: 0,id,title,abstract
0,automated-bridge-component-recognition-using,Automated Bridge Component Recognition using V...,This paper investigates the automated recognit...
1,flexible-collaborative-estimation-of-the,Robust inference on the average treatment effe...,Many estimators of the average effect of a tre...
2,consistent-individualized-feature-attribution,Consistent Individualized Feature Attribution ...,A unified approach to explain the output of an...
3,on-enhancing-speech-emotion-recognition-using,On Enhancing Speech Emotion Recognition using ...,Generative Adversarial Networks (GANs) have ga...
4,evaluating-and-characterizing-incremental,Evaluating and Characterizing Incremental Lear...,Incremental learning from non-stationary data ...


In [None]:
paper_ids = df['id'].tolist()

paper_methods_data = []

for paper_id in paper_ids:
    url = f"https://paperswithcode.com/api/v1/papers/{paper_id}/methods/"
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        methods_list = data.get("results", [])
        # Extract method names if available; adjust the key if needed
        method_names = [method.get("name", "N/A") for method in methods_list]
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving methods for paper {paper_id}: {e}")
        method_names = []

    # Save the paper id and its methods (as a comma-separated string)
    paper_methods_data.append({
        "paper_id": paper_id,
        "methods": ", ".join(method_names)
    })

    # Brief pause to be polite to the API server

In [None]:
df_paper_methods = pd.DataFrame(paper_methods_data)
df_paper_methods

Unnamed: 0,paper_id,methods
0,automated-bridge-component-recognition-using,
1,flexible-collaborative-estimation-of-the,
2,consistent-individualized-feature-attribution,
3,on-enhancing-speech-emotion-recognition-using,"GAN, Convolution"
4,evaluating-and-characterizing-incremental,
...,...,...
19995,non-compliance-and-missing-data-in-health,
19996,model-less-active-compliance-for-continuum,
19997,extracting-vehicle-sensor-signals-from-can,
19998,rans-turbulence-model-development-using-cfd,


In [None]:
df_paper_methods = df_paper_methods[df_paper_methods['methods'].str.strip() != '']
df_paper_methods

Unnamed: 0,paper_id,methods
3,on-enhancing-speech-emotion-recognition-using,"GAN, Convolution"
5,snap-ml-a-hierarchical-framework-for-machine,Logistic Regression
9,a-novel-hybrid-machine-learning-model-for,SVM
10,feature-learning-and-classification-in,PCA
13,laplacian-smoothing-gradient-descent,Logistic Regression
...,...,...
18115,fit-a-fast-and-accurate-framework-for-solving,AutoEncoder
18128,improving-students-performance-in-small-scale,Dropout
19387,improving-protein-gamma-turn-prediction-using,Capsule Network
19607,capsule-deep-neural-network-for-recognition,Capsule Network


In [None]:
df_paper_methods.to_csv("output.csv", index=False)

In [None]:
# Define a mapping from various token forms to a standardized learning algorithm name.
# This mapping includes only learning algorithms and excludes supporting steps such as PCA, ICA, and SOM.
model_aliases = {
    "support vector machine": "SVM",
    "svm": "SVM",
    "convolutional neural network": "CNN",
    "cnn": "CNN",
    "recurrent neural network": "RNN",
    "rnn": "RNN",
    "long short-term memory": "LSTM",
    "lstm": "LSTM",
    "gated recurrent unit": "GRU",
    "gru": "GRU",
    "transformer": "Transformer",
    "bert": "BERT",
    "gpt": "GPT",
    "random forest": "Random Forest",
    "decision tree": "Decision Tree",
    "gradient boosting": "Gradient Boosting",
    "xgboost": "XGBoost",
    "lightgbm": "LightGBM",
    "catboost": "CatBoost",
    "logistic regression": "Logistic Regression",
    "linear regression": "Linear Regression",
    "naive bayes": "Naive Bayes",
    "k-nearest neighbors": "KNN",
    "knn": "KNN",
    "autoencoder": "Autoencoder",
    "deep belief network": "Deep Belief Network",
    "dbn": "Deep Belief Network",
    "multilayer perceptron": "MLP",
    "mlp": "MLP",
    "adaboost": "AdaBoost",
    "bagging": "Bagging",
    "ensemble": "Ensemble",
    "gaussian process": "Gaussian Process",
    "reinforcement learning": "Reinforcement Learning",
    "q-learning": "Q-Learning",
    "policy gradient": "Policy Gradient",
    "actor critic": "Actor Critic",
    "genetic algorithm": "Genetic Algorithm",
    "evolutionary algorithm": "Evolutionary Algorithm",
    "deep reinforcement learning": "Deep Reinforcement Learning",
    "autoencoder": "Autoencoder",
    "denoising autoencoder": "Autoencoder",
    "sparse autoencoder": "Autoencoder",
    "alexnet": "AlexNet",
    "albert": "ALBERT",
    "bilstm": "BiLSTM",
    "bigru": "BiGRU",
    "biggan": "BigGAN",
    "biggan-deep": "BigGAN-Deep",
    "capsule network": "Capsule Network",
    "chexnet": "CheXNet",
    "convlstm": "ConvLSTM",
    "dcgan": "DCGAN",
    "dcnn": "DCNN",
    "densenet": "DenseNet",
    "efficientnet": "EfficientNet",
    "electra": "ELECTRA",
    "elmo": "ELMo",
    "faster r-cnn": "Faster R-CNN",
    "gan": "GAN",
    "gat": "GAT",
    "gcn": "GCN",
    "gin": "GIN",
    "googlenet": "GoogleNet",
    "graph neural network": "GNN",
    "graphsage": "GraphSAGE",
    "lenet": "LeNet",
    "mobilenetv1": "MobileNetV1",
    "mobilenetv2": "MobileNetV2",
    "resnet": "ResNet",
    "resnext": "ResNeXt",
    "retinanet": "RetinaNet",
    "roberta": "RoBERTa",
    "sdne": "SDNE",
    "segnet": "SegNet",
    "seq2seq": "Seq2Seq",
    "squeezenet": "SqueezeNet",
    "stylegan": "StyleGAN",
    "stylegan2": "StyleGAN2",
    "t5": "T5",
    "transformer-xl": "Transformer-XL",
    "unet": "U-Net",
    "unet++": "U-Net++",
    "vae": "VAE",
    "vgg": "VGG",
    "vgg-19": "VGG-19",
    "xception": "Xception",
    "xlm": "XLM",
    "xlnet": "XLNet",
    "yolov1": "YOLOv1",
    "yolov3": "YOLOv3",
    "yolov4": "YOLOv4",
    "zfnet": "ZFNet"
}

def extract_learning_algorithms(methods_str):
    """
    Extracts and standardizes learning algorithms from a methods string based on the model_aliases mapping.
    Returns a list of unique standardized algorithm names.
    """
    if pd.isnull(methods_str):
        return []
    methods_lower = methods_str.lower()
    found = set()
    for token, normalized in model_aliases.items():
        if token in methods_lower:
            found.add(normalized)
    return list(found)

# Apply the extraction function to the "methods" column
df_paper_methods['extracted_models'] = df_paper_methods['methods'].apply(extract_learning_algorithms)

# Keep only rows where at least one learning algorithm was identified
df_paper_methods = df_paper_methods[df_paper_methods['extracted_models'].apply(lambda x: len(x) > 0)].copy()

# Expand the list of extracted models into separate columns (model1, model2, etc.)
models_expanded = df_paper_methods['extracted_models'].apply(pd.Series)
models_expanded = models_expanded.rename(columns=lambda x: f"model{x+1}")

# Combine the original DataFrame (dropping the helper column) with the expanded model columns
df_final = pd.concat([df_paper_methods.drop(columns=["extracted_models"]), models_expanded], axis=1)

df_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_paper_methods['extracted_models'] = df_paper_methods['methods'].apply(extract_learning_algorithms)


Unnamed: 0,paper_id,methods,model1,model2,model3,model4,model5,model6
3,on-enhancing-speech-emotion-recognition-using,"GAN, Convolution",GAN,,,,,
5,snap-ml-a-hierarchical-framework-for-machine,Logistic Regression,Logistic Regression,,,,,
9,a-novel-hybrid-machine-learning-model-for,SVM,SVM,,,,,
13,laplacian-smoothing-gradient-descent,Logistic Regression,Logistic Regression,,,,,
73,examining-the-use-of-neural-networks-for,SVM,SVM,,,,,
...,...,...,...,...,...,...,...,...
18085,a-role-for-prior-knowledge-in-statistical,"Logistic Regression, SVM, Feature Selection",Logistic Regression,SVM,,,,
18087,using-machine-learning-to-calibrate-storm,Logistic Regression,Logistic Regression,,,,,
18115,fit-a-fast-and-accurate-framework-for-solving,AutoEncoder,Autoencoder,,,,,
19387,improving-protein-gamma-turn-prediction-using,Capsule Network,Capsule Network,,,,,


In [None]:
df_final.to_csv("columns.csv", index=False)