# Hugging Face AK and research community papers

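This notebook walks through retrieving paper and author information from the Hugging Face daily papers page (curated by AK and the research community), and looking up papers by topic or author through arXiv and OpenAlex.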

## Initial setup

In [1]:
import sys
import os

# Put the directory one level above the working directory on the import path,
# so that packages living next to this notebook's folder can be imported.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Append only once so re-running the cell does not duplicate the entry.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

from scripts.hf_scraper_script import HFPapersScraper
from scripts.arxiv_utils import *
from scripts.openalex_utils import *

## Getting a list of authors from the papers

### 1. Get list of authors from a list of papers on huggingface: [AK and community daily papers](https://huggingface.co/papers)

In [2]:
# Initialize the Hugging Face papers scraper
hf_papers_scraper = HFPapersScraper()

# Get the arXiv ids of the papers from the scraped links
arxiv_ids = hf_papers_scraper.get_paper_ids()

# For every arXiv id, fetch the arXiv API response, parse it, and print the
# authors of the first parsed entry.
for arxiv_id in arxiv_ids:
    xml_content = get_content_by_id(arxiv_id)
    # NOTE(review): extract_paper_info appears to return a list of paper dicts
    # (it is indexed with [0] and iterated elsewhere in this notebook) -- confirm.
    papers = extract_paper_info(xml_content)
    if not papers:
        # An id with no parsed entry would raise IndexError on papers[0] and
        # abort the whole loop; report it and move on to the next id instead.
        print(arxiv_id, "no paper info found")
        continue
    print(arxiv_id, papers[0]['authors'])

2411.04923 ['Shehan Munasinghe', 'Hanan Gani', 'Wenqi Zhu', 'Jiale Cao', 'Eric Xing', 'Fahad Shahbaz Khan', 'Salman Khan']
2411.05000 ['Jonathan Roberts', 'Kai Han', 'Samuel Albanie']
2411.04075 ['Chuhan Li', 'Ziyao Shangguan', 'Yilun Zhao', 'Deyuan Li', 'Yixin Liu', 'Arman Cohan']
2411.04905 ['Siming Huang', 'Tianhao Cheng', 'Jason Klein Liu', 'Jiaran Hao', 'Liuyihan Song', 'Yang Xu', 'J. Yang', 'J. H. Liu', 'Chenchen Zhang', 'Linzheng Chai', 'Ruifeng Yuan', 'Zhaoxiang Zhang', 'Jie Fu', 'Qian Liu', 'Ge Zhang', 'Zili Wang', 'Yuan Qi', 'Yinghui Xu', 'Wei Chu']
2411.04928 ['Wenqiang Sun', 'Shuo Chen', 'Fangfu Liu', 'Zilong Chen', 'Yueqi Duan', 'Jun Zhang', 'Yikai Wang']
2411.04989 ['Koichi Namekata', 'Sherwin Bahmani', 'Ziyi Wu', 'Yash Kant', 'Igor Gilitschenski', 'David B. Lindell']
2411.04709 ['Wenhao Wang', 'Yi Yang']
2411.05003 ['David Junhao Zhang', 'Roni Paiss', 'Shiran Zada', 'Nikhil Karnad', 'David E. Jacobs', 'Yael Pritch', 'Inbar Mosseri', 'Mike Zheng Shou', 'Neal Wadhwa', 'Nat

### 2. Get list of paper authors by topic

#### 2a. ArXiv

In [7]:
# Each entry is either a single topic string or a list of topics queried together.
topic_list = ["negative sampling", "llm", ["reinforcement learning", "transformers"]]
for topic in topic_list:
    print(f"Topic: {topic}")
    # Query arXiv for the topic, then parse the response into paper records.
    topic_xml = get_content_by_topic(topic, filter_type='any')
    topic_papers = extract_paper_info(topic_xml)
    # Keep only the title and the author list of every paper for display.
    print([
        {'title': paper['title'], 'author': paper['authors']}
        for paper in topic_papers
    ])

Topic: negative sampling
[{'title': 'Entity Aware Negative Sampling with Auxiliary Loss of False Negative\n  Prediction for Knowledge Graph Embedding', 'author': ['Sang-Hyun Je']}, {'title': 'Does Negative Sampling Matter? A Review with Insights into its Theory\n  and Applications', 'author': ['Zhen Yang', 'Ming Ding', 'Tinglin Huang', 'Yukuo Cen', 'Junshuai Song', 'Bin Xu', 'Yuxiao Dong', 'Jie Tang']}, {'title': 'Bayesian Negative Sampling for Recommendation', 'author': ['Bin Liu', 'Bang Wang']}, {'title': 'Synthetic Hard Negative Samples for Contrastive Learning', 'author': ['Hengkui Dong', 'Xianzhong Long', 'Yun Li', 'Lei Chen']}, {'title': 'UFNRec: Utilizing False Negative Samples for Sequential Recommendation', 'author': ['Xiaoyang Liu', 'Chong Liu', 'Pinzheng Wang', 'Rongqin Zheng', 'Lixin Zhang', 'Leyu Lin', 'Zhijun Chen', 'Liangliang Fu']}, {'title': 'Rethinking Samples Selection for Contrastive Learning: Mining of\n  Potential Samples', 'author': ['Hengkui Dong', 'Xianzhong Lo

#### 2b. OpenAlex

In [2]:
# Look up the OpenAlex topic ids matching the keyword, then fetch the works
# for the first matching topic (the request URL is printed).
topic_ids = get_topic_ids_by_keyword("artificial intelligence")
first_topic_id = topic_ids[0]
works = get_works_by_topic_id(first_topic_id, print_url=True)

https://api.openalex.org/works?filter=topics.id:https://openalex.org/T10906


In [4]:
# Title and author display names for the first two works
[
    (
        work['title'],
        [authorship['author']['display_name'] for authorship in work['authorships']],
    )
    for work in works[:2]
]

[('Probabilistic Reasoning in Intelligent Systems: Networks of Plausible Inference',
  ['Judea Pearl']),
 ('Bayesian Data Analysis',
  ['Andrew Gelman',
   'John B. Carlin',
   'Hal S. Stern',
   'David B. Dunson',
   'Aki Vehtari',
   'Donald B. Rubin'])]

### 3. Get a list of papers by author name from arXiv

In [4]:
# Format the author name for the arXiv query, fetch the matching entries
# (the request URL is printed), and display the parsed paper records.
author_name = "richard sutton"
formatted_author = format_author_name(author_name)
author_xml = get_content_by_author(formatted_author, print_url=True)
extract_paper_info(author_xml)

http://export.arxiv.org/api/query?search_query=au%3Asutton_richard


[{'title': 'True Online Emphatic TD($λ$): Quick Reference and Implementation\n  Guide',
  'authors': ['Richard S. Sutton'],
  'summary': 'This document is a guide to the implementation of true online emphatic\nTD($\\lambda$), a model-free temporal-difference algorithm for learning to make\nlong-term predictions which combines the emphasis idea (Sutton, Mahmood & White\n2015) and the true-online idea (van Seijen & Sutton 2014). The setting used\nhere includes linear function approximation, the possibility of off-policy\ntraining, and all the generality of general value functions, as well as the\nemphasis algorithm\'s notion of "interest".'},
 {'title': 'A History of Meta-gradient: Gradient Methods for Meta-learning',
  'authors': ['Richard S. Sutton'],
  'summary': 'The history of meta-learning methods based on gradient descent is reviewed,\nfocusing primarily on methods that adapt step-size (learning rate)\nmeta-parameters.'},
 {'title': 'The Quest for a Common Model of the Intelligent