# arXiv　APIを叩いてみる
- 参考 https://note.nkmk.me/python-arxiv-api-download-rss/ 
    - 記事で扱っているversionが古い
- 参考 https://pypi.org/project/arxiv/ 
    - pypiのサイトにExampleが載っているた

In [2]:
# 使用するライブラリのインポート
import arxiv
import pandas as pd
import os

In [3]:
def fetch_papers_from_arxiv(search_query=None, max_results=3, csv_filename = "datas/arxiv_papers.csv"):
    """
    arXiv APIを使用して論文を取得し、CSVファイルに保存する関数

    Parameters:
        search_query (dict{"thema": "query"}): 検索クエリ
            ex) queries = {
                    "LLM": 'cat:cs.CL OR cat:cs.AI AND "large language model"',
                    "Machine Learning": 'cat:cs.LG OR cat:stat.ML AND "machine learning"',
                    "XAI": 'cat:cs.AI OR cat:cs.LG AND "explainable AI"'
                }
        max_results (int): 取得する論文の最大数
        csv_filename (str): 保存するCSVファイル名

    Returns:
        DataFrame: 新しい論文情報を含むDataFrame
    """
    if search_query is None:
        # テーマごとのクエリ
        queries = {
            "LLM": 'cat:cs.CL OR cat:cs.AI AND "large language model"',
            "Machine Learning": 'cat:cs.LG OR cat:stat.ML AND "machine learning"',
            "XAI": 'cat:cs.AI OR cat:cs.LG AND "explainable AI"'
        }
    
    # 新しく取得した論文の情報を格納するリスト
    new_papers_list = []

    # 既存のCSVファイルを読み込む（存在すれば）
    if os.path.exists(csv_filename):
        existing_papers_df = pd.read_csv(csv_filename)
    else:
        # CSVが存在しない場合は空のDataFrameを作成
        existing_papers_df = pd.DataFrame(columns=["Title", "Authors", "Published", "URL", "Abstract", "PDF"])

    # 既存のURL一覧を取得
    existing_urls = existing_papers_df["URL"].tolist()

    # 各クエリで論文を取得し、既存のデータと比較
    for theme, query in queries.items():
        search = arxiv.Search(
            query=query,
            max_results=5,  # 各テーマで取得する最大論文数
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        for result in search.results():
            # 論文情報を辞書にまとめる
            paper_info = {
                "Title": result.title,
                "Theme": theme,
                "Authors": ', '.join([author.name for author in result.authors]),
                "Published": str(result.published),
                "URL": result.entry_id,
                "Abstract": result.summary,
                "PDF": result.pdf_url
            }

            # 既存の論文リストにないか、または更新されている場合はリストに追加
            if paper_info["URL"] not in existing_urls or paper_info["Published"] != existing_papers_df.loc[existing_papers_df["URL"] == paper_info["URL"], "Published"].values[0]:
                new_papers_list.append(paper_info)

    # 新しい論文があれば処理
    if new_papers_list:
        # 新しい論文のDataFrameを作成
        new_papers_df = pd.DataFrame(new_papers_list)

        # 既存の論文と新しい論文を結合
        updated_papers_df = pd.concat([existing_papers_df, new_papers_df], ignore_index=True)

        # CSVファイルに上書き保存
        updated_papers_df.to_csv(csv_filename, index=False)

        # print(f"{len(new_papers_list)} 件の新しい論文を追加しました。")

    return new_papers_df


In [4]:
fetch_papers_from_arxiv()

  for result in search.results():


Unnamed: 0,Title,Theme,Authors,Published,URL,Abstract,PDF
0,To CoT or not to CoT? Chain-of-thought helps m...,LLM,"Zayne Sprague, Fangcong Yin, Juan Diego Rodrig...",2024-09-18 17:55:00+00:00,http://arxiv.org/abs/2409.12183v1,Chain-of-thought (CoT) via prompting is the de...,http://arxiv.org/pdf/2409.12183v1
1,Decoding Style: Efficient Fine-Tuning of LLMs ...,LLM,"Najmeh Forouzandehmehr, Nima Farrokhsiar, Rami...",2024-09-18 17:15:06+00:00,http://arxiv.org/abs/2409.12150v1,Personalized outfit recommendation remains a c...,http://arxiv.org/pdf/2409.12150v1
2,Takin: A Cohort of Superior Quality Zero-shot ...,LLM,"EverestAI, :, Sijin Chen, Yuan Feng, Laipeng H...",2024-09-18 17:03:12+00:00,http://arxiv.org/abs/2409.12139v1,With the advent of the big data and large lang...,http://arxiv.org/pdf/2409.12139v1
3,Qwen2.5-Math Technical Report: Toward Mathemat...,LLM,"An Yang, Beichen Zhang, Binyuan Hui, Bofei Gao...",2024-09-18 16:45:37+00:00,http://arxiv.org/abs/2409.12122v1,"In this report, we present a series of math-sp...",http://arxiv.org/pdf/2409.12122v1
4,Measuring Human and AI Values based on Generat...,LLM,"Haoran Ye, Yuhang Xie, Yuanyi Ren, Hanjun Fang...",2024-09-18 16:26:22+00:00,http://arxiv.org/abs/2409.12106v1,Human values and their measurement are long-st...,http://arxiv.org/pdf/2409.12106v1
5,Fitting Multilevel Factor Models,Machine Learning,"Tetiana Parshakova, Trevor Hastie, Stephen Boyd",2024-09-18 15:39:12+00:00,http://arxiv.org/abs/2409.12067v1,We examine a special case of the multilevel fa...,http://arxiv.org/pdf/2409.12067v1
6,Cartan moving frames and the data manifolds,Machine Learning,"Eliot Tron, Rita Fioresi, Nicolas Couellan, St...",2024-09-18 15:31:29+00:00,http://arxiv.org/abs/2409.12057v1,The purpose of this paper is to employ the lan...,http://arxiv.org/pdf/2409.12057v1
7,Symmetry-Based Structured Matrices for Efficie...,Machine Learning,"Ashwin Samudre, Mircea Petrache, Brian D. Nord...",2024-09-18 07:52:33+00:00,http://arxiv.org/abs/2409.11772v1,There has been much recent interest in designi...,http://arxiv.org/pdf/2409.11772v1
8,Recurrent Interpolants for Probabilistic Time ...,Machine Learning,"Yu Chen, Marin Biloš, Sarthak Mittal, Wei Deng...",2024-09-18 03:52:48+00:00,http://arxiv.org/abs/2409.11684v1,Sequential models such as recurrent neural net...,http://arxiv.org/pdf/2409.11684v1
9,PieClam: A Universal Graph Autoencoder Based o...,Machine Learning,"Daniel Zilberg, Ron Levie",2024-09-18 00:49:42+00:00,http://arxiv.org/abs/2409.11618v1,We propose PieClam (Prior Inclusive Exclusive ...,http://arxiv.org/pdf/2409.11618v1
